This is page 2 of 9. Use http://codebase.md/datalab-to/marker?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .github
│ ├── ISSUE_TEMPLATE
│ │ ├── breaking-bug-report.md
│ │ ├── feature_request.md
│ │ └── output-bug-report.md
│ └── workflows
│   ├── benchmarks.yml
│   ├── ci.yml
│   ├── cla.yml
│   ├── publish.yml
│   └── scripts.yml
├── .gitignore
├── .pre-commit-config.yaml
├── benchmarks
│ ├── __init__.py
│ ├── overall
│ │ ├── __init__.py
│ │ ├── display
│ │ │ ├── __init__.py
│ │ │ ├── dataset.py
│ │ │ └── table.py
│ │ ├── download
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── llamaparse.py
│ │ │ ├── main.py
│ │ │ ├── mathpix.py
│ │ │ └── mistral.py
│ │ ├── elo.py
│ │ ├── methods
│ │ │ ├── __init__.py
│ │ │ ├── docling.py
│ │ │ ├── gt.py
│ │ │ ├── llamaparse.py
│ │ │ ├── marker.py
│ │ │ ├── mathpix.py
│ │ │ ├── mistral.py
│ │ │ ├── olmocr.py
│ │ │ └── schema.py
│ │ ├── overall.py
│ │ ├── registry.py
│ │ ├── schema.py
│ │ └── scorers
│ │   ├── __init__.py
│ │   ├── clean.py
│ │   ├── heuristic.py
│ │   ├── llm.py
│ │   └── schema.py
│ ├── table
│ │ ├── __init__.py
│ │ ├── gemini.py
│ │ ├── inference.py
│ │ ├── scoring.py
│ │ └── table.py
│ ├── throughput
│ │ ├── __init__.py
│ │ └── main.py
│ └── verify_scores.py
├── chunk_convert.py
├── CLA.md
├── convert_single.py
├── convert.py
├── data
│ ├── .gitignore
│ ├── examples
│ │ ├── json
│ │ │ ├── multicolcnn.json
│ │ │ ├── switch_trans.json
│ │ │ └── thinkpython.json
│ │ └── markdown
│ │   ├── multicolcnn
│ │   │ ├── _page_1_Figure_0.jpeg
│ │   │ ├── _page_2_Picture_0.jpeg
│ │   │ ├── _page_6_Figure_0.jpeg
│ │   │ ├── _page_7_Figure_0.jpeg
│ │   │ ├── multicolcnn_meta.json
│ │   │ └── multicolcnn.md
│ │   ├── switch_transformers
│ │   │ ├── _page_11_Figure_4.jpeg
│ │   │ ├── _page_12_Figure_4.jpeg
│ │   │ ├── _page_13_Figure_2.jpeg
│ │   │ ├── _page_18_Figure_1.jpeg
│ │   │ ├── _page_18_Figure_3.jpeg
│ │   │ ├── _page_2_Figure_3.jpeg
│ │   │ ├── _page_20_Figure_1.jpeg
│ │   │ ├── _page_20_Figure_4.jpeg
│ │   │ ├── _page_27_Figure_1.jpeg
│ │   │ ├── _page_29_Figure_1.jpeg
│ │   │ ├── _page_30_Figure_1.jpeg
│ │   │ ├── _page_31_Figure_3.jpeg
│ │   │ ├── _page_4_Figure_1.jpeg
│ │   │ ├── _page_5_Figure_3.jpeg
│ │   │ ├── switch_trans_meta.json
│ │   │ └── switch_trans.md
│ │   └── thinkpython
│ │     ├── _page_109_Figure_1.jpeg
│ │     ├── _page_115_Figure_1.jpeg
│ │     ├── _page_116_Figure_3.jpeg
│ │     ├── _page_127_Figure_1.jpeg
│ │     ├── _page_128_Figure_1.jpeg
│ │     ├── _page_167_Figure_1.jpeg
│ │     ├── _page_169_Figure_1.jpeg
│ │     ├── _page_173_Figure_1.jpeg
│ │     ├── _page_190_Figure_1.jpeg
│ │     ├── _page_195_Figure_1.jpeg
│ │     ├── _page_205_Figure_1.jpeg
│ │     ├── _page_23_Figure_1.jpeg
│ │     ├── _page_23_Figure_3.jpeg
│ │     ├── _page_230_Figure_1.jpeg
│ │     ├── _page_233_Figure_1.jpeg
│ │     ├── _page_233_Figure_3.jpeg
│ │     ├── _page_234_Figure_1.jpeg
│ │     ├── _page_235_Figure_1.jpeg
│ │     ├── _page_236_Figure_1.jpeg
│ │     ├── _page_236_Figure_3.jpeg
│ │     ├── _page_237_Figure_1.jpeg
│ │     ├── _page_238_Figure_1.jpeg
│ │     ├── _page_46_Figure_1.jpeg
│ │     ├── _page_60_Figure_1.jpeg
│ │     ├── _page_60_Figure_3.jpeg
│ │     ├── _page_67_Figure_1.jpeg
│ │     ├── _page_71_Figure_1.jpeg
│ │     ├── _page_78_Figure_1.jpeg
│ │     ├── _page_85_Figure_1.jpeg
│ │     ├── _page_94_Figure_1.jpeg
│ │     ├── _page_99_Figure_17.jpeg
│ │     ├── _page_99_Figure_178.jpeg
│ │     ├── thinkpython_meta.json
│ │     └── thinkpython.md
│ ├── images
│ │ ├── overall.png
│ │ ├── per_doc.png
│ │ └── table.png
│ └── latex_to_md.sh
├── examples
│ ├── marker_modal_deployment.py
│ └── README.md
├── extraction_app.py
├── LICENSE
├── marker
│ ├── builders
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── layout.py
│ │ ├── line.py
│ │ ├── ocr.py
│ │ └── structure.py
│ ├── config
│ │ ├── __init__.py
│ │ ├── crawler.py
│ │ ├── parser.py
│ │ └── printer.py
│ ├── converters
│ │ ├── __init__.py
│ │ ├── extraction.py
│ │ ├── ocr.py
│ │ ├── pdf.py
│ │ └── table.py
│ ├── extractors
│ │ ├── __init__.py
│ │ ├── document.py
│ │ └── page.py
│ ├── logger.py
│ ├── models.py
│ ├── output.py
│ ├── processors
│ │ ├── __init__.py
│ │ ├── blank_page.py
│ │ ├── block_relabel.py
│ │ ├── blockquote.py
│ │ ├── code.py
│ │ ├── debug.py
│ │ ├── document_toc.py
│ │ ├── equation.py
│ │ ├── footnote.py
│ │ ├── ignoretext.py
│ │ ├── line_merge.py
│ │ ├── line_numbers.py
│ │ ├── list.py
│ │ ├── llm
│ │ │ ├── __init__.py
│ │ │ ├── llm_complex.py
│ │ │ ├── llm_equation.py
│ │ │ ├── llm_form.py
│ │ │ ├── llm_handwriting.py
│ │ │ ├── llm_image_description.py
│ │ │ ├── llm_mathblock.py
│ │ │ ├── llm_meta.py
│ │ │ ├── llm_page_correction.py
│ │ │ ├── llm_sectionheader.py
│ │ │ ├── llm_table_merge.py
│ │ │ └── llm_table.py
│ │ ├── order.py
│ │ ├── page_header.py
│ │ ├── reference.py
│ │ ├── sectionheader.py
│ │ ├── table.py
│ │ ├── text.py
│ │ └── util.py
│ ├── providers
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── epub.py
│ │ ├── html.py
│ │ ├── image.py
│ │ ├── pdf.py
│ │ ├── powerpoint.py
│ │ ├── registry.py
│ │ ├── spreadsheet.py
│ │ └── utils.py
│ ├── renderers
│ │ ├── __init__.py
│ │ ├── chunk.py
│ │ ├── extraction.py
│ │ ├── html.py
│ │ ├── json.py
│ │ ├── markdown.py
│ │ └── ocr_json.py
│ ├── schema
│ │ ├── __init__.py
│ │ ├── blocks
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── basetable.py
│ │ │ ├── caption.py
│ │ │ ├── code.py
│ │ │ ├── complexregion.py
│ │ │ ├── equation.py
│ │ │ ├── figure.py
│ │ │ ├── footnote.py
│ │ │ ├── form.py
│ │ │ ├── handwriting.py
│ │ │ ├── inlinemath.py
│ │ │ ├── listitem.py
│ │ │ ├── pagefooter.py
│ │ │ ├── pageheader.py
│ │ │ ├── picture.py
│ │ │ ├── reference.py
│ │ │ ├── sectionheader.py
│ │ │ ├── table.py
│ │ │ ├── tablecell.py
│ │ │ ├── text.py
│ │ │ └── toc.py
│ │ ├── document.py
│ │ ├── groups
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── figure.py
│ │ │ ├── list.py
│ │ │ ├── page.py
│ │ │ ├── picture.py
│ │ │ └── table.py
│ │ ├── polygon.py
│ │ ├── registry.py
│ │ └── text
│ │   ├── __init__.py
│ │   ├── char.py
│ │   ├── line.py
│ │   └── span.py
│ ├── scripts
│ │ ├── __init__.py
│ │ ├── chunk_convert.py
│ │ ├── chunk_convert.sh
│ │ ├── common.py
│ │ ├── convert_single.py
│ │ ├── convert.py
│ │ ├── extraction_app.py
│ │ ├── file_to_s3.py
│ │ ├── run_streamlit_app.py
│ │ ├── server.py
│ │ └── streamlit_app.py
│ ├── services
│ │ ├── __init__.py
│ │ ├── azure_openai.py
│ │ ├── claude.py
│ │ ├── gemini.py
│ │ ├── ollama.py
│ │ ├── openai.py
│ │ └── vertex.py
│ ├── settings.py
│ ├── util.py
│ └── utils
│   ├── __init__.py
│   ├── batch.py
│   ├── gpu.py
│   └── image.py
├── marker_app.py
├── marker_server.py
├── MODEL_LICENSE
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── README.md
├── signatures
│ └── version1
│   └── cla.json
├── static
│ └── fonts
│   └── .gitignore
└── tests
  ├── builders
  │ ├── test_blank_page.py
  │ ├── test_document_builder.py
  │ ├── test_garbled_pdf.py
  │ ├── test_layout_replace.py
  │ ├── test_ocr_builder.py
  │ ├── test_ocr_pipeline.py
  │ ├── test_overriding.py
  │ ├── test_pdf_links.py
  │ ├── test_rotated_bboxes.py
  │ ├── test_strip_existing_ocr.py
  │ └── test_structure.py
  ├── config
  │ └── test_config.py
  ├── conftest.py
  ├── converters
  │ ├── test_extraction_converter.py
  │ ├── test_ocr_converter.py
  │ ├── test_pdf_converter.py
  │ └── test_table_converter.py
  ├── processors
  │ ├── test_document_toc_processor.py
  │ ├── test_equation_processor.py
  │ ├── test_footnote_processor.py
  │ ├── test_ignoretext.py
  │ ├── test_llm_processors.py
  │ ├── test_table_merge.py
  │ └── test_table_processor.py
  ├── providers
  │ ├── test_document_providers.py
  │ ├── test_image_provider.py
  │ └── test_pdf_provider.py
  ├── renderers
  │ ├── test_chunk_renderer.py
  │ ├── test_extract_images.py
  │ ├── test_html_renderer.py
  │ ├── test_json_renderer.py
  │ └── test_markdown_renderer.py
  ├── schema
  │ └── groups
  │   └── test_list_grouping.py
  ├── services
  │ └── test_service_init.py
  └── utils.py
```
# Files
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
```toml
1 | [tool.poetry]
2 | name = "marker-pdf"
3 | version = "1.10.1"
4 | description = "Convert documents to markdown with high speed and accuracy."
5 | authors = ["Vik Paruchuri <[email protected]>"]
6 | readme = "README.md"
7 | license = "GPL-3.0-or-later"
8 | repository = "https://github.com/VikParuchuri/marker"
9 | keywords = ["pdf", "markdown", "ocr", "nlp"]
10 | packages = [
11 | {include = "marker"}
12 | ]
13 | include = [
14 | "marker/scripts/*.sh",
15 | "marker/scripts/*.html",
16 | ]
17 |
18 | [tool.poetry.dependencies]
19 | python = "^3.10"
20 | Pillow = "^10.1.0"
21 | pydantic = "^2.4.2"
22 | pydantic-settings = "^2.0.3"
23 | transformers = "^4.45.2"
24 | python-dotenv = "^1.0.0"
25 | torch = "^2.7.0"
26 | tqdm = "^4.66.1"
27 | ftfy = "^6.1.1"
28 | rapidfuzz = "^3.8.1"
29 | surya-ocr = "^0.17.0"
30 | regex = "^2024.4.28"
31 | pdftext = "~0.6.3"
32 | markdownify = "^1.1.0"
33 | click = "^8.2.0"
34 | markdown2 = "^2.5.2"
35 | filetype = "^1.2.0"
36 | google-genai = "^1.0.0"
37 | anthropic = "^0.46.0"
38 | pre-commit = "^4.2.0"
39 | scikit-learn = "^1.6.1"
40 |
41 | # Optional dependencies for documents
42 | mammoth = {version = "^1.9.0", optional = true}
43 | openpyxl = {version = "^3.1.5", optional = true}
44 | python-pptx = {version = "^1.0.2", optional = true}
45 | ebooklib = {version = "^0.18", optional = true}
46 | weasyprint = {version = "^63.1", optional = true}
47 | openai = "^1.65.2"
48 |
49 | [tool.poetry.group.dev.dependencies]
50 | jupyter = "^1.0.0"
51 | datasets = "^2.21.0"
52 | streamlit = "^1.37.1"
53 | fastapi = "^0.115.4"
54 | uvicorn = "^0.32.0"
55 | python-multipart = "^0.0.16"
56 | pytest = "^8.3.3"
57 | pytest-mock = "^3.14.0"
58 | apted = "1.0.3"
59 | distance = "0.1.3"
60 | lxml = "5.3.0"
61 | tabulate = "^0.9.0"
62 | latex2mathml = "^3.77.0"
63 | playwright = "^1.49.1"
64 |
65 | [tool.poetry.extras]
66 | full = ["mammoth", "openpyxl", "python-pptx", "ebooklib", "weasyprint"]
67 |
68 | [tool.poetry.scripts]
69 | marker = "marker.scripts.convert:convert_cli"
70 | marker_single = "marker.scripts.convert_single:convert_single_cli"
71 | marker_chunk_convert = "marker.scripts.chunk_convert:chunk_convert_cli"
72 | marker_gui = "marker.scripts.run_streamlit_app:streamlit_app_cli"
73 | marker_extract = "marker.scripts.run_streamlit_app:extraction_app_cli"
74 | marker_server = "marker.scripts.server:server_cli"
75 |
76 | [build-system]
77 | requires = ["poetry-core"]
78 | build-backend = "poetry.core.masonry.api"
79 |
```
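The `[tool.poetry.scripts]` table above wires up the CLI entry points; the same pipeline is available from Python. A minimal usage sketch (the input path is a placeholder, and the model weights are downloaded on first use):

```python
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

# Load model artifacts once; they can be shared across many conversions.
converter = PdfConverter(artifact_dict=create_model_dict())

rendered = converter("document.pdf")  # placeholder path
text, _, images = text_from_rendered(rendered)
```

Installing with the `full` extra pulls in mammoth, openpyxl, python-pptx, ebooklib, and weasyprint, which the non-PDF providers below rely on.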
--------------------------------------------------------------------------------
/marker/converters/table.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Tuple
2 |
3 | from marker.builders.document import DocumentBuilder
4 | from marker.builders.line import LineBuilder
5 | from marker.builders.ocr import OcrBuilder
6 | from marker.converters.pdf import PdfConverter
7 | from marker.processors import BaseProcessor
8 | from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
9 | from marker.processors.llm.llm_form import LLMFormProcessor
10 | from marker.processors.llm.llm_table import LLMTableProcessor
11 | from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
12 | from marker.processors.table import TableProcessor
13 | from marker.providers.registry import provider_from_filepath
14 | from marker.schema import BlockTypes
15 |
16 |
17 | class TableConverter(PdfConverter):
18 | default_processors: Tuple[BaseProcessor, ...] = (
19 | TableProcessor,
20 | LLMTableProcessor,
21 | LLMTableMergeProcessor,
22 | LLMFormProcessor,
23 | LLMComplexRegionProcessor,
24 | )
25 | converter_block_types: Tuple[BlockTypes, ...] = (
26 | BlockTypes.Table,
27 | BlockTypes.Form,
28 | BlockTypes.TableOfContents,
29 | )
30 |
31 | def build_document(self, filepath: str):
32 | provider_cls = provider_from_filepath(filepath)
33 | layout_builder = self.resolve_dependencies(self.layout_builder_class)
34 | line_builder = self.resolve_dependencies(LineBuilder)
35 | ocr_builder = self.resolve_dependencies(OcrBuilder)
36 | document_builder = DocumentBuilder(self.config)
37 | document_builder.disable_ocr = True
38 |
39 | provider = provider_cls(filepath, self.config)
40 | document = document_builder(provider, layout_builder, line_builder, ocr_builder)
41 |
42 | for page in document.pages:
43 | page.structure = [
44 | p for p in page.structure if p.block_type in self.converter_block_types
45 | ]
46 |
47 | for processor in self.processor_list:
48 | processor(document)
49 |
50 | return document
51 |
52 | def __call__(self, filepath: str):
53 | document = self.build_document(filepath)
54 | self.page_count = len(document.pages)
55 |
56 | renderer = self.resolve_dependencies(self.renderer)
57 | return renderer(document)
58 |
```
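`TableConverter` runs the normal PDF pipeline with OCR disabled at build time, strips each page's structure down to `Table`, `Form`, and `TableOfContents` blocks, and then applies only the table-focused processors. A usage sketch (placeholder path, models loaded as above):

```python
from marker.converters.table import TableConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

converter = TableConverter(artifact_dict=create_model_dict())
rendered = converter("tables.pdf")  # placeholder path

# Everything except table-like blocks was filtered out of page.structure,
# so the rendered output contains only the tables.
text, _, images = text_from_rendered(rendered)
```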
--------------------------------------------------------------------------------
/marker/schema/blocks/basetable.py:
--------------------------------------------------------------------------------
```python
1 | from typing import List
2 |
3 | from marker.schema import BlockTypes
4 | from marker.schema.blocks import Block, BlockOutput
5 | from marker.schema.blocks.tablecell import TableCell
6 |
7 |
8 | class BaseTable(Block):
9 | block_type: BlockTypes | None = None
10 | html: str | None = None
11 |
12 | @staticmethod
13 | def format_cells(
14 | document, child_blocks, block_config, child_cells: List[TableCell] | None = None
15 | ):
16 | if child_cells is None:
17 | child_cells: List[TableCell] = [
18 | document.get_block(c.id)
19 | for c in child_blocks
20 | if c.id.block_type == BlockTypes.TableCell
21 | ]
22 |
23 | unique_rows = sorted(list(set([c.row_id for c in child_cells])))
24 | html_repr = "<table><tbody>"
25 | for row_id in unique_rows:
26 | row_cells = sorted(
27 | [c for c in child_cells if c.row_id == row_id], key=lambda x: x.col_id
28 | )
29 | html_repr += "<tr>"
30 | for cell in row_cells:
31 | html_repr += cell.assemble_html(
32 | document, child_blocks, None, block_config
33 | )
34 | html_repr += "</tr>"
35 | html_repr += "</tbody></table>"
36 | return html_repr
37 |
38 | def assemble_html(
39 | self,
40 | document,
41 | child_blocks: List[BlockOutput],
42 | parent_structure=None,
43 | block_config: dict | None = None,
44 | ):
45 | # Filter out the table cells, so they don't render twice
46 | child_ref_blocks = [
47 | block
48 | for block in child_blocks
49 | if block.id.block_type == BlockTypes.Reference
50 | ]
51 | template = super().assemble_html(
52 | document, child_ref_blocks, parent_structure, block_config
53 | )
54 |
55 | child_block_types = set([c.id.block_type for c in child_blocks])
56 | if self.html:
57 | # LLM processor
58 | return template + self.html
59 | elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types:
60 | # Table processor
61 | return template + self.format_cells(document, child_blocks, block_config)
62 | else:
63 | # Default text lines and spans
64 | return f"<p>{template}</p>"
65 |
```
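`format_cells` groups `TableCell` children by `row_id`, sorts each row by `col_id`, and concatenates the cells' HTML into a `<table>`. A standalone sketch of that grouping with plain tuples standing in for cell blocks (illustrative only, not the repo's API):

```python
# (row_id, col_id, html) triples standing in for TableCell blocks.
cells = [(1, 0, "<td>B1</td>"), (0, 1, "<td>A2</td>"), (0, 0, "<td>A1</td>")]

html = "<table><tbody>"
for row_id in sorted({row for row, _, _ in cells}):
    row = sorted((c for c in cells if c[0] == row_id), key=lambda c: c[1])
    html += "<tr>" + "".join(c[2] for c in row) + "</tr>"
html += "</tbody></table>"
# -> <table><tbody><tr><td>A1</td><td>A2</td></tr><tr><td>B1</td></tr></tbody></table>
```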
--------------------------------------------------------------------------------
/benchmarks/overall/download/mathpix.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import time
3 |
4 | import requests
5 |
6 | from benchmarks.overall.download.base import Downloader
7 |
8 |
9 | class MathpixDownloader(Downloader):
10 | service = "mathpix"
11 |
12 | def get_html(self, pdf_bytes):
13 | headers = {
14 | "app_id": self.app_id,
15 | "app_key": self.api_key,
16 | }
17 | start = time.time()
18 | pdf_id = mathpix_request(pdf_bytes, headers)
19 | status = mathpix_status(pdf_id, headers)
20 | if status in ["processing", "error"]:
21 | md = ""
22 | else:
23 | md = mathpix_results(pdf_id, headers)
24 | end = time.time()
25 | if isinstance(md, bytes):
26 | md = md.decode("utf-8")
27 |
28 | return {
29 | "md": md,
30 | "time": end - start
31 | }
32 |
33 | def mathpix_request(buffer, headers):
34 | response = requests.post("https://api.mathpix.com/v3/pdf",
35 | headers=headers,
36 | data={
37 | "options_json": json.dumps(
38 | {
39 | "conversion_formats": {
40 | "md": True,
41 | "html": True
42 | }
43 | }
44 | )
45 | },
46 | files={
47 | "file": buffer
48 | }
49 | )
50 | data = response.json()
51 | pdf_id = data["pdf_id"]
52 | return pdf_id
53 |
54 | def mathpix_status(pdf_id, headers):
55 | max_iters = 120
56 | i = 0
57 | status = "processing"
58 | status2 = "processing"
59 | while i < max_iters:
60 | time.sleep(1)
61 | response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}",
62 | headers=headers
63 | )
64 | status_resp = response.json()
65 | if "conversion_status" not in status_resp:
66 | continue
67 | status = status_resp["conversion_status"]["md"]["status"]
68 | status2 = status_resp["conversion_status"]["html"]["status"]
69 | if status == "completed" and status2 == "completed":
70 | break
71 | elif status == "error" or status2 == "error":
72 | break
73 | out_status = "completed" if status == "completed" and status2 == "completed" else "error"
74 | return out_status
75 |
76 | def mathpix_results(pdf_id, headers, ext="md"):
77 | response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}",
78 | headers=headers
79 | )
80 | return response.content
81 |
```
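The three module-level helpers form a submit/poll/fetch sequence against the Mathpix PDF API. A sketch of how they chain, assuming valid credentials (the header values and the path are placeholders):

```python
from benchmarks.overall.download.mathpix import (
    mathpix_request, mathpix_status, mathpix_results,
)

headers = {"app_id": "my-app", "app_key": "my-key"}  # placeholder credentials

with open("paper.pdf", "rb") as f:  # placeholder path
    pdf_id = mathpix_request(f.read(), headers)  # submit the PDF
status = mathpix_status(pdf_id, headers)         # poll for up to ~120s
if status == "completed":
    md = mathpix_results(pdf_id, headers, ext="md").decode("utf-8")
```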
--------------------------------------------------------------------------------
/tests/builders/test_garbled_pdf.py:
--------------------------------------------------------------------------------
```python
1 | import pytest
2 |
3 | from marker.builders.document import DocumentBuilder
4 | from marker.builders.line import LineBuilder
5 | from marker.processors.table import TableProcessor
6 | from marker.schema import BlockTypes
7 |
8 |
9 | @pytest.mark.filename("water_damage.pdf")
10 | def test_garbled_pdf(pdf_document, recognition_model, table_rec_model, detection_model):
11 | assert pdf_document.pages[0].structure[0] == "/page/0/Table/0"
12 |
13 | table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0])
14 | assert table_block.block_type == BlockTypes.Table
15 | assert table_block.structure[0] == "/page/0/Line/10"
16 |
17 | table_cell = pdf_document.pages[0].get_block(table_block.structure[0])
18 | assert table_cell.block_type == BlockTypes.Line
19 |
20 | # We don't OCR in the initial pass, only with the TableProcessor
21 | processor = TableProcessor(recognition_model, table_rec_model, detection_model)
22 | processor(pdf_document)
23 |
24 | table = pdf_document.pages[0].contained_blocks(pdf_document, (BlockTypes.Table,))[0]
25 | assert "варіант" in table.raw_text(pdf_document)
26 |
27 | table_cell = pdf_document.pages[0].get_block(table_block.structure[0])
28 | assert table_cell.block_type == BlockTypes.TableCell
29 |
30 |
31 | @pytest.mark.filename("hindi_judgement.pdf")
32 | @pytest.mark.config({"page_range": [2, 3], "disable_ocr": True})
33 | def test_garbled_builder(config, doc_provider, detection_model, ocr_error_model):
34 | line_builder = LineBuilder(detection_model, ocr_error_model, config)
35 | builder = DocumentBuilder(config)
36 | document = builder.build_document(doc_provider)
37 |
38 | bad_ocr_results = line_builder.ocr_error_detection(
39 | document.pages, doc_provider.page_lines
40 | )
41 | assert len(bad_ocr_results.labels) == 2
42 | assert any([label == "bad" for label in bad_ocr_results.labels])
43 |
44 |
45 | @pytest.mark.filename("adversarial.pdf")
46 | @pytest.mark.config({"page_range": [2, 3], "disable_ocr": True})
47 | def test_nongarbled_builder(config, doc_provider, detection_model, ocr_error_model):
48 | line_builder = LineBuilder(detection_model, ocr_error_model, config)
49 | builder = DocumentBuilder(config)
50 | document = builder.build_document(doc_provider)
51 |
52 | bad_ocr_results = line_builder.ocr_error_detection(
53 | document.pages, doc_provider.page_lines
54 | )
55 | assert len(bad_ocr_results.labels) == 2
56 | assert all([label == "good" for label in bad_ocr_results.labels])
57 |
```
--------------------------------------------------------------------------------
/marker/processors/order.py:
--------------------------------------------------------------------------------
```python
1 | from statistics import mean
2 | from collections import defaultdict
3 |
4 | from marker.processors import BaseProcessor
5 | from marker.schema import BlockTypes
6 | from marker.schema.document import Document
7 |
8 |
9 | class OrderProcessor(BaseProcessor):
10 | """
11 | A processor for sorting the blocks in order if needed. This can help when the layout image was sliced.
12 | """
13 | block_types = tuple()
14 |
15 | def __call__(self, document: Document):
16 | for page in document.pages:
17 | # Skip OCRed pages
18 | if page.text_extraction_method != "pdftext":
19 | continue
20 |
21 | # Skip pages without layout slicing
22 | if not page.layout_sliced:
23 | continue
24 |
25 | block_idxs = defaultdict(int)
26 | for block_id in page.structure:
27 | block = document.get_block(block_id)
28 | spans = block.contained_blocks(document, (BlockTypes.Span, ))
29 | if len(spans) == 0:
30 | continue
31 |
32 | # Avg span position in original PDF
33 | block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2
34 |
35 | for block_id in page.structure:
36 | # Already assigned block id via span position
37 | if block_idxs[block_id] > 0:
38 | continue
39 |
40 | block = document.get_block(block_id)
41 | prev_block = document.get_prev_block(block)
42 | next_block = document.get_next_block(block)
43 |
44 | block_idx_add = 0
45 | if prev_block:
46 | block_idx_add = 1
47 |
48 | while prev_block and prev_block.id not in block_idxs:
49 | prev_block = document.get_prev_block(prev_block)
50 | block_idx_add += 1
51 |
52 | if not prev_block:
53 | block_idx_add = -1
54 | while next_block and next_block.id not in block_idxs:
55 | next_block = document.get_next_block(next_block)
56 | block_idx_add -= 1
57 |
58 | if not next_block and not prev_block:
59 | pass
60 | elif prev_block:
61 | block_idxs[block_id] = block_idxs[prev_block.id] + block_idx_add
62 | else:
63 | block_idxs[block_id] = block_idxs[next_block.id] + block_idx_add
64 |
65 | page.structure = sorted(page.structure, key=lambda x: block_idxs[x])
66 |
67 |
```
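The sort key is the mean of a block's first and last span positions in the original PDF; spanless blocks inherit an offset from the nearest neighbor that has a position. A toy sketch of the fallback arithmetic (block names are illustrative):

```python
# Span-derived positions; "figure" has no spans, so it starts unset (0).
block_idxs = {"text_a": 10.0, "figure": 0, "text_b": 30.0}

# "figure" is one block after "text_a", so it takes text_a's position + 1,
# which slots it between its neighbors without disturbing their order.
block_idxs["figure"] = block_idxs["text_a"] + 1

assert sorted(block_idxs, key=block_idxs.get) == ["text_a", "figure", "text_b"]
```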
--------------------------------------------------------------------------------
/tests/converters/test_extraction_converter.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import pytest
3 |
4 | from marker.converters.extraction import ExtractionConverter
5 | from marker.extractors.page import PageExtractionSchema
6 | from marker.extractors.document import DocumentExtractionSchema
7 | from marker.services import BaseService
8 |
9 |
10 | class MockLLMService(BaseService):
11 | def __call__(self, prompt, image=None, page=None, response_schema=None, **kwargs):
12 | if response_schema == PageExtractionSchema:
13 | return {
14 | "description": "Mock extraction description",
15 | "detailed_notes": "Mock detailed notes for page extraction",
16 | }
17 | elif response_schema == DocumentExtractionSchema:
18 | return {
19 | "analysis": "Mock document analysis",
20 | "document_json": json.dumps({"test_key": "test_value"}),
21 | }
22 | return {}
23 |
24 |
25 | @pytest.fixture
26 | def mock_llm_service():
27 | return MockLLMService
28 |
29 |
30 | @pytest.fixture
31 | def extraction_converter(config, model_dict, mock_llm_service):
32 | test_schema = {
33 | "title": "TestSchema",
34 | "type": "object",
35 | "properties": {"test_key": {"title": "Test Key", "type": "string"}},
36 | "required": ["test_key"],
37 | }
38 |
39 | config["page_schema"] = json.dumps(test_schema)
40 | config["output_format"] = "markdown"
41 | model_dict["llm_service"] = mock_llm_service
42 |
43 | converter = ExtractionConverter(
44 | artifact_dict=model_dict, processor_list=None, config=config
45 | )
46 | converter.llm_service = mock_llm_service
47 | converter.default_llm_service = MockLLMService
48 | return converter
49 |
50 |
51 | @pytest.mark.config({"page_range": [0]})
52 | def test_extraction_converter(config, model_dict, mock_llm_service, temp_doc):
53 | config["page_schema"] = "invalid json"
54 |
55 | model_dict["llm_service"] = mock_llm_service
56 | converter = ExtractionConverter(
57 | artifact_dict=model_dict, processor_list=None, config=config
58 | )
59 | converter.artifact_dict["llm_service"] = mock_llm_service()
60 |
61 | results = converter(temp_doc.name)
62 | assert results.document_json == '{"test_key": "test_value"}'
63 |
64 |
65 | @pytest.mark.config({"page_range": [0, 1]})
66 | def test_extraction_converter_multiple_pages(extraction_converter, temp_doc):
67 | result = extraction_converter(temp_doc.name)
68 |
69 | assert result is not None
70 | assert result.document_json is not None
71 | assert json.loads(result.document_json) == {"test_key": "test_value"}
72 | assert result.analysis == "Mock document analysis"
73 |
```
--------------------------------------------------------------------------------
/marker/processors/llm/llm_meta.py:
--------------------------------------------------------------------------------
```python
1 | from concurrent.futures import ThreadPoolExecutor
2 | from typing import List, Dict, Any
3 |
4 | from marker.logger import get_logger
5 | from tqdm import tqdm
6 |
7 | from marker.processors.llm import BaseLLMSimpleBlockProcessor, BaseLLMProcessor
8 | from marker.schema.document import Document
9 | from marker.services import BaseService
10 |
11 | logger = get_logger()
12 |
13 |
14 | class LLMSimpleBlockMetaProcessor(BaseLLMProcessor):
15 | """
16 | A wrapper for simple LLM processors, so they can all run in parallel.
17 | """
18 |
19 | def __init__(
20 | self,
21 | processor_lst: List[BaseLLMSimpleBlockProcessor],
22 | llm_service: BaseService,
23 | config=None,
24 | ):
25 | super().__init__(llm_service, config)
26 | self.processors = processor_lst
27 |
28 | def __call__(self, document: Document):
29 | if not self.use_llm or self.llm_service is None:
30 | return
31 |
32 | total = sum(
33 | [len(processor.inference_blocks(document)) for processor in self.processors]
34 | )
35 | pbar = tqdm(
36 | desc="LLM processors running", disable=self.disable_tqdm, total=total
37 | )
38 |
39 | all_prompts = [
40 | processor.block_prompts(document) for processor in self.processors
41 | ]
42 | pending = []
43 | futures_map = {}
44 | with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
45 | for i, prompt_lst in enumerate(all_prompts):
46 | for prompt in prompt_lst:
47 | future = executor.submit(self.get_response, prompt)
48 | pending.append(future)
49 | futures_map[future] = {"processor_idx": i, "prompt_data": prompt}
50 |
51 | for future in pending:
52 | try:
53 | result = future.result()
54 | future_data = futures_map.pop(future)
55 | processor: BaseLLMSimpleBlockProcessor = self.processors[
56 | future_data["processor_idx"]
57 | ]
58 | # finalize the result
59 | processor(result, future_data["prompt_data"], document)
60 | except Exception as e:
61 | logger.warning(f"Error processing LLM response: {e}")
62 |
63 | pbar.update(1)
64 |
65 | pbar.close()
66 |
67 | def get_response(self, prompt_data: Dict[str, Any]):
68 | return self.llm_service(
69 | prompt_data["prompt"],
70 | prompt_data["image"],
71 | prompt_data["block"],
72 | prompt_data["schema"],
73 | )
74 |
```
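The meta-processor fans every block prompt out through a single `ThreadPoolExecutor` and uses `futures_map` to route each response back to the processor that created the prompt. A stripped-down sketch of that fan-out/route-back pattern (the `work` function is a stand-in for `get_response`):

```python
from concurrent.futures import ThreadPoolExecutor

def work(prompt: str) -> str:  # stand-in for self.get_response
    return prompt.upper()

prompts = [("proc_0", "fix table"), ("proc_1", "fix equation")]
futures_map = {}
with ThreadPoolExecutor(max_workers=4) as executor:
    for owner, prompt in prompts:
        futures_map[executor.submit(work, prompt)] = owner
    for future, owner in futures_map.items():
        # Each response goes back to the processor that produced the prompt.
        print(owner, future.result())
```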
--------------------------------------------------------------------------------
/marker/converters/__init__.py:
--------------------------------------------------------------------------------
```python
1 | import inspect
2 | from typing import Optional, List, Type
3 |
4 | from pydantic import BaseModel
5 |
6 | from marker.processors import BaseProcessor
7 | from marker.processors.llm import BaseLLMSimpleBlockProcessor
8 | from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
9 | from marker.util import assign_config, download_font
10 |
11 |
12 | class BaseConverter:
13 | def __init__(self, config: Optional[BaseModel | dict] = None):
14 | assign_config(self, config)
15 | self.config = config
16 | self.llm_service = None
17 |
18 | # Download render font, needed for some providers
19 | download_font()
20 |
21 | def __call__(self, *args, **kwargs):
22 | raise NotImplementedError
23 |
24 | def resolve_dependencies(self, cls):
25 | init_signature = inspect.signature(cls.__init__)
26 | parameters = init_signature.parameters
27 |
28 | resolved_kwargs = {}
29 | for param_name, param in parameters.items():
30 | if param_name == 'self':
31 | continue
32 | elif param_name == 'config':
33 | resolved_kwargs[param_name] = self.config
34 | elif param.name in self.artifact_dict:
35 | resolved_kwargs[param_name] = self.artifact_dict[param_name]
36 | elif param.default != inspect.Parameter.empty:
37 | resolved_kwargs[param_name] = param.default
38 | else:
39 | raise ValueError(f"Cannot resolve dependency for parameter: {param_name}")
40 |
41 | return cls(**resolved_kwargs)
42 |
43 | def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) -> List[BaseProcessor]:
44 | processors = []
45 | for processor_cls in processor_cls_lst:
46 | processors.append(self.resolve_dependencies(processor_cls))
47 |
48 | simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
49 | other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)]
50 |
51 | if not simple_llm_processors:
52 | return processors
53 |
54 | llm_positions = [i for i, p in enumerate(processors) if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
55 | insert_position = max(0, llm_positions[-1] - len(simple_llm_processors) + 1)
56 |
57 | meta_processor = LLMSimpleBlockMetaProcessor(
58 | processor_lst=simple_llm_processors,
59 | llm_service=self.llm_service,
60 | config=self.config,
61 | )
62 | other_processors.insert(insert_position, meta_processor)
63 | return other_processors
```
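`resolve_dependencies` is signature-driven injection: each constructor parameter is filled from the config, the shared artifact dict, or its own default. A self-contained sketch of the same lookup order with a hypothetical class (nothing below is part of the repo):

```python
import inspect

class DemoProcessor:  # hypothetical class, for illustration only
    def __init__(self, recognition_model, config=None, batch_size=8):
        self.recognition_model = recognition_model
        self.config = config
        self.batch_size = batch_size

artifact_dict = {"recognition_model": object()}
config = {"output_format": "markdown"}

kwargs = {}
for name, param in inspect.signature(DemoProcessor.__init__).parameters.items():
    if name == "self":
        continue
    elif name == "config":
        kwargs[name] = config                 # config is special-cased
    elif name in artifact_dict:
        kwargs[name] = artifact_dict[name]    # shared model artifacts
    elif param.default is not inspect.Parameter.empty:
        kwargs[name] = param.default          # fall back to the default
    else:
        raise ValueError(f"Cannot resolve dependency: {name}")

processor = DemoProcessor(**kwargs)  # batch_size resolves to its default, 8
```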
--------------------------------------------------------------------------------
/marker/providers/registry.py:
--------------------------------------------------------------------------------
```python
1 | import filetype
2 | import filetype.match as file_match
3 | from bs4 import BeautifulSoup
4 | from filetype.types import archive, document, IMAGE
5 |
6 | from marker.providers.document import DocumentProvider
7 | from marker.providers.epub import EpubProvider
8 | from marker.providers.html import HTMLProvider
9 | from marker.providers.image import ImageProvider
10 | from marker.providers.pdf import PdfProvider
11 | from marker.providers.powerpoint import PowerPointProvider
12 | from marker.providers.spreadsheet import SpreadSheetProvider
13 |
14 | DOCTYPE_MATCHERS = {
15 | "image": IMAGE,
16 | "pdf": [
17 | archive.Pdf,
18 | ],
19 | "epub": [
20 | archive.Epub,
21 | ],
22 | "doc": [document.Docx],
23 | "xls": [document.Xlsx],
24 | "ppt": [document.Pptx],
25 | }
26 |
27 |
28 | def load_matchers(doctype: str):
29 | return [cls() for cls in DOCTYPE_MATCHERS[doctype]]
30 |
31 |
32 | def load_extensions(doctype: str):
33 | return [cls.EXTENSION for cls in DOCTYPE_MATCHERS[doctype]]
34 |
35 |
36 | def provider_from_ext(filepath: str):
37 | ext = filepath.rsplit(".", 1)[-1].strip()
38 | if not ext:
39 | return PdfProvider
40 |
41 | if ext in load_extensions("image"):
42 | return ImageProvider
43 | if ext in load_extensions("pdf"):
44 | return PdfProvider
45 | if ext in load_extensions("doc"):
46 | return DocumentProvider
47 | if ext in load_extensions("xls"):
48 | return SpreadSheetProvider
49 | if ext in load_extensions("ppt"):
50 | return PowerPointProvider
51 | if ext in load_extensions("epub"):
52 | return EpubProvider
53 | if ext in ["html"]:
54 | return HTMLProvider
55 |
56 | return PdfProvider
57 |
58 |
59 | def provider_from_filepath(filepath: str):
60 | if filetype.image_match(filepath) is not None:
61 | return ImageProvider
62 | if file_match(filepath, load_matchers("pdf")) is not None:
63 | return PdfProvider
64 | if file_match(filepath, load_matchers("epub")) is not None:
65 | return EpubProvider
66 | if file_match(filepath, load_matchers("doc")) is not None:
67 | return DocumentProvider
68 | if file_match(filepath, load_matchers("xls")) is not None:
69 | return SpreadSheetProvider
70 | if file_match(filepath, load_matchers("ppt")) is not None:
71 | return PowerPointProvider
72 |
73 | try:
74 | with open(filepath, "r", encoding="utf-8") as f:
75 | soup = BeautifulSoup(f.read(), "html.parser")
76 | # Check if there are any HTML tags
77 | if bool(soup.find()):
78 | return HTMLProvider
79 | except Exception:
80 | pass
81 |
82 | # Fallback if we incorrectly detect the file type
83 | return provider_from_ext(filepath)
84 |
```
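`provider_from_filepath` sniffs file bytes first and falls back to the extension only when byte matching fails, so a misnamed file still routes to the right provider. Usage is a lookup followed by instantiation (placeholder path; detection reads the file, so it must exist):

```python
from marker.providers.registry import provider_from_filepath

provider_cls = provider_from_filepath("report.docx")  # -> DocumentProvider for a real .docx
provider = provider_cls("report.docx", config=None)
```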
--------------------------------------------------------------------------------
/marker/processors/blank_page.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated
2 |
3 | from PIL import Image
4 | import numpy as np
5 | import cv2
6 |
7 | from marker.processors import BaseProcessor
8 | from marker.schema import BlockTypes
9 | from marker.schema.blocks import Block
10 | from marker.schema.document import Document
11 |
12 | from marker.logger import get_logger
13 |
14 | logger = get_logger()
15 |
16 |
17 | class BlankPageProcessor(BaseProcessor):
18 | """
19 | A processor to filter out blank pages detected as a single layout block
20 | """
21 |
22 | full_page_block_intersection_threshold: Annotated[
23 | float, "Threshold to detect blank pages at"
24 | ] = 0.8
25 | filter_blank_pages: Annotated[bool, "Remove blank pages detected as images."] = (
26 | False
27 | )
28 |
29 | def is_blank(self, image: Image.Image):
30 | image = np.asarray(image)
31 | if image.size == 0 or image.shape[0] == 0 or image.shape[1] == 0:
32 | # Handle empty image case
33 | return True
34 |
35 | gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
36 | gray = cv2.GaussianBlur(gray, (7, 7), 0)
37 |
38 | # Adaptive threshold (inverse for text as white)
39 | binarized = cv2.adaptiveThreshold(
40 | gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15
41 | )
42 |
43 | num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
44 | binarized, connectivity=8
45 | )
46 | cleaned = np.zeros_like(binarized)
47 | for i in range(1, num_labels): # skip background
48 | cleaned[labels == i] = 255
49 |
50 | kernel = np.ones((1, 5), np.uint8)
51 | dilated = cv2.dilate(cleaned, kernel, iterations=3)
52 | b = dilated / 255
53 | return b.sum() == 0
54 |
55 | def __call__(self, document: Document):
56 | if not self.filter_blank_pages:
57 | return
58 |
59 | for page in document.pages:
60 | structure_blocks = page.structure_blocks(document)
61 | if not structure_blocks or len(structure_blocks) > 1:
62 | continue
63 |
64 | full_page_block: Block = structure_blocks[0]
65 |
66 | conditions = [
67 | full_page_block.block_type in [BlockTypes.Picture, BlockTypes.Figure],
68 | self.is_blank(full_page_block.get_image(document)),
69 | page.polygon.intersection_area(full_page_block.polygon)
70 | > self.full_page_block_intersection_threshold,
71 | ]
72 |
73 | if all(conditions):
74 | logger.debug(f"Removing blank block {full_page_block.id}")
75 | page.remove_structure_items([full_page_block.id])
76 | full_page_block.removed = True
77 |
```
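`is_blank` binarizes the crop with an adaptive threshold, keeps connected components, dilates, and calls the image blank only if nothing survives. A quick check on a synthetic all-white image, assuming the default `BaseProcessor` constructor that takes an optional config:

```python
from PIL import Image
from marker.processors.blank_page import BlankPageProcessor

processor = BlankPageProcessor()  # assumes config defaults to None
assert processor.is_blank(Image.new("RGB", (200, 200), "white"))
```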
--------------------------------------------------------------------------------
/marker/processors/util.py:
--------------------------------------------------------------------------------
```python
1 | import re
2 |
3 | from bs4 import BeautifulSoup
4 |
5 | from marker.schema import BlockTypes
6 | from marker.schema.groups import PageGroup
7 | from marker.schema.registry import get_block_class
8 | from marker.schema.text import Line
9 |
10 |
11 | def escape_latex_commands(text: str):
12 | text = (text
13 | .replace('\n', '\\n')
14 | .replace('\t', '\\t')
15 | .replace('\r', '\\r'))
16 | return text
17 |
18 |
19 | def add_math_spans_to_line(corrected_text: str, text_line: Line, page: PageGroup):
20 | SpanClass = get_block_class(BlockTypes.Span)
21 | corrected_spans = text_to_spans(corrected_text)
22 |
23 | for span_idx, span in enumerate(corrected_spans):
24 | if span_idx == len(corrected_spans) - 1:
25 | span['content'] += "\n"
26 |
27 | span_block = page.add_full_block(
28 | SpanClass(
29 | polygon=text_line.polygon,
30 | text=span['content'],
31 | font='Unknown',
32 | font_weight=0,
33 | font_size=0,
34 | minimum_position=0,
35 | maximum_position=0,
36 | formats=[span['type']],
37 | url=span.get('url'),
38 | page_id=text_line.page_id,
39 | text_extraction_method="gemini",
40 | has_superscript=span["has_superscript"],
41 | has_subscript=span["has_subscript"]
42 | )
43 | )
44 | text_line.structure.append(span_block.id)
45 |
46 |
47 | def text_to_spans(text):
48 | soup = BeautifulSoup(text, 'html.parser')
49 |
50 | tag_types = {
51 | 'b': 'bold',
52 | 'i': 'italic',
53 | 'math': 'math',
54 | 'sub': 'plain',
55 | 'sup': 'plain',
56 | 'span': 'plain'
57 | }
58 | spans = []
59 |
60 | for element in soup.descendants:
61 | if not len(list(element.parents)) == 1:
62 | continue
63 |
64 | url = element.attrs.get('href') if hasattr(element, 'attrs') else None
65 |
66 | if element.name in tag_types:
67 | text = element.get_text()
68 | if element.name == "math":
69 | text = escape_latex_commands(text)
70 | spans.append({
71 | 'type': tag_types[element.name],
72 | 'content': text,
73 | 'url': url,
74 | "has_superscript": element.name == "sup",
75 | "has_subscript": element.name == "sub"
76 | })
77 | elif element.string:
78 | spans.append({
79 | 'type': 'plain',
80 | 'content': element.string,
81 | 'url': url,
82 | "has_superscript": False,
83 | "has_subscript": False
84 | })
85 |
86 | return spans
```
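`text_to_spans` flattens one level of inline HTML into span dicts; only direct children of the fragment are kept (the `len(list(element.parents)) == 1` check). A small input/output example:

```python
from marker.processors.util import text_to_spans

spans = text_to_spans("<b>bold</b> plain <sub>2</sub>")
# Yields, in order:
#   {'type': 'bold',  'content': 'bold', ..., 'has_subscript': False}
#   {'type': 'plain', 'content': ' plain ', ...}
#   {'type': 'plain', 'content': '2', ..., 'has_subscript': True}
```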
--------------------------------------------------------------------------------
/marker/providers/__init__.py:
--------------------------------------------------------------------------------
```python
1 | from copy import deepcopy
2 | from typing import List, Optional, Dict
3 |
4 | from PIL import Image
5 | from pydantic import BaseModel
6 |
7 | from pdftext.schema import Reference
8 |
9 | from marker.logger import configure_logging
10 | from marker.schema.polygon import PolygonBox
11 | from marker.schema.text import Span
12 | from marker.schema.text.char import Char
13 | from marker.schema.text.line import Line
14 | from marker.settings import settings
15 | from marker.util import assign_config
16 |
17 | configure_logging()
18 |
19 |
20 | class ProviderOutput(BaseModel):
21 | line: Line
22 | spans: List[Span]
23 | chars: Optional[List[List[Char]]] = None
24 |
25 | @property
26 | def raw_text(self):
27 | return "".join(span.text for span in self.spans)
28 |
29 | def __hash__(self):
30 | return hash(tuple(self.line.polygon.bbox))
31 |
32 | def merge(self, other: "ProviderOutput"):
33 | new_output = deepcopy(self)
34 | other_copy = deepcopy(other)
35 |
36 | new_output.spans.extend(other_copy.spans)
37 | if new_output.chars is not None and other_copy.chars is not None:
38 | new_output.chars.extend(other_copy.chars)
39 | elif other_copy.chars is not None:
40 | new_output.chars = other_copy.chars
41 |
42 | new_output.line.polygon = new_output.line.polygon.merge(
43 | [other_copy.line.polygon]
44 | )
45 | return new_output
46 |
47 |
48 | ProviderPageLines = Dict[int, List[ProviderOutput]]
49 |
50 |
51 | class BaseProvider:
52 | def __init__(self, filepath: str, config: Optional[BaseModel | dict] = None):
53 | assign_config(self, config)
54 | self.filepath = filepath
55 |
56 | def __len__(self):
57 | pass
58 |
59 | def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
60 | pass
61 |
62 | def get_page_bbox(self, idx: int) -> PolygonBox | None:
63 | pass
64 |
65 | def get_page_lines(self, idx: int) -> List[Line]:
66 | pass
67 |
68 | def get_page_refs(self, idx: int) -> List[Reference]:
69 | pass
70 |
71 | def __enter__(self):
72 | return self
73 |
74 | @staticmethod
75 | def get_font_css():
76 | from weasyprint import CSS
77 | from weasyprint.text.fonts import FontConfiguration
78 |
79 | font_config = FontConfiguration()
80 | css = CSS(
81 | string=f"""
82 | @font-face {{
83 | font-family: GoNotoCurrent-Regular;
84 | src: url({settings.FONT_PATH});
85 | font-display: swap;
86 | }}
87 | body {{
88 | font-family: {settings.FONT_NAME.split(".")[0]}, sans-serif;
89 | font-variant-ligatures: none;
90 | font-feature-settings: "liga" 0;
91 | text-rendering: optimizeLegibility;
92 | }}
93 | """,
94 | font_config=font_config,
95 | )
96 | return css
97 |
```
--------------------------------------------------------------------------------
/marker/providers/document.py:
--------------------------------------------------------------------------------
```python
1 | import base64
2 | import os
3 | import re
4 | import tempfile
5 | from io import BytesIO
6 |
7 | from PIL import Image
8 | from marker.logger import get_logger
9 |
10 | from marker.providers.pdf import PdfProvider
11 |
12 | logger = get_logger()
13 |
14 | css = """
15 | @page {
16 | size: A4;
17 | margin: 2cm;
18 | }
19 |
20 | img {
21 | max-width: 100%;
22 | max-height: 25cm;
23 | object-fit: contain;
24 | margin: 12pt auto;
25 | }
26 |
27 | div, p {
28 | max-width: 100%;
29 | word-break: break-word;
30 | font-size: 10pt;
31 | }
32 |
33 | table {
34 | width: 100%;
35 | border-collapse: collapse;
36 | break-inside: auto;
37 | font-size: 10pt;
38 | }
39 |
40 | tr {
41 | break-inside: avoid;
42 | page-break-inside: avoid;
43 | }
44 |
45 | td {
46 | border: 0.75pt solid #000;
47 | padding: 6pt;
48 | }
49 | """
50 |
51 |
52 | class DocumentProvider(PdfProvider):
53 | def __init__(self, filepath: str, config=None):
54 | temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
55 | self.temp_pdf_path = temp_pdf.name
56 | temp_pdf.close()
57 |
58 | # Convert DOCX to PDF
59 | try:
60 | self.convert_docx_to_pdf(filepath)
61 | except Exception as e:
62 | raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}")
63 |
64 | # Initialize the PDF provider with the temp pdf path
65 | super().__init__(self.temp_pdf_path, config)
66 |
67 | def __del__(self):
68 | if os.path.exists(self.temp_pdf_path):
69 | os.remove(self.temp_pdf_path)
70 |
71 | def convert_docx_to_pdf(self, filepath: str):
72 | from weasyprint import CSS, HTML
73 | import mammoth
74 |
75 | with open(filepath, "rb") as docx_file:
76 | # we convert the docx to HTML
77 | result = mammoth.convert_to_html(docx_file)
78 | html = result.value
79 |
80 | # We convert the HTML into a PDF
81 | HTML(string=self._preprocess_base64_images(html)).write_pdf(
82 | self.temp_pdf_path, stylesheets=[CSS(string=css), self.get_font_css()]
83 | )
84 |
85 | @staticmethod
86 | def _preprocess_base64_images(html_content):
87 | pattern = r'data:([^;]+);base64,([^"\'>\s]+)'
88 |
89 | def convert_image(match):
90 | try:
91 | img_data = base64.b64decode(match.group(2))
92 |
93 | with BytesIO(img_data) as bio:
94 | with Image.open(bio) as img:
95 | output = BytesIO()
96 | img.save(output, format=img.format)
97 | new_base64 = base64.b64encode(output.getvalue()).decode()
98 | return f"data:{match.group(1)};base64,{new_base64}"
99 |
100 | except Exception as e:
101 | logger.error(f"Failed to process image: {e}")
102 | return "" # we ditch broken images as that breaks the PDF creation down the line
103 |
104 | return re.sub(pattern, convert_image, html_content)
105 |
```
--------------------------------------------------------------------------------
/benchmarks/overall/methods/olmocr.py:
--------------------------------------------------------------------------------
```python
1 | import base64
2 | import json
3 | import tempfile
4 | import time
5 | from io import BytesIO
6 |
7 | import torch
8 | from PIL import Image
9 |
10 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult
11 |
12 |
13 | def convert_single_page(filename: str, model, processor, device):
14 | from olmocr.data.renderpdf import render_pdf_to_base64png
15 | from olmocr.prompts import build_finetuning_prompt
16 | from olmocr.prompts.anchor import get_anchor_text
17 |
18 | image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1024)
19 |
20 | # Build the prompt, using document metadata
21 | anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=4000)
22 | prompt = build_finetuning_prompt(anchor_text)
23 |
24 | # Build the full prompt
25 | messages = [
26 | {
27 | "role": "user",
28 | "content": [
29 | {"type": "text", "text": prompt},
30 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
31 | ],
32 | }
33 | ]
34 |
35 | # Apply the chat template and processor
36 | text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
37 | main_image = Image.open(BytesIO(base64.b64decode(image_base64)))
38 |
39 | inputs = processor(
40 | text=[text],
41 | images=[main_image],
42 | padding=True,
43 | return_tensors="pt",
44 | )
45 | inputs = {key: value.to(device) for (key, value) in inputs.items()}
46 |
47 | # Generate the output
48 | output = model.generate(
49 | **inputs,
50 | temperature=0.8,
51 | max_new_tokens=8192,
52 | num_return_sequences=1,
53 | do_sample=True,
54 | )
55 |
56 | # Decode the output
57 | prompt_length = inputs["input_ids"].shape[1]
58 | new_tokens = output[:, prompt_length:]
59 | text_output = processor.tokenizer.batch_decode(
60 | new_tokens, skip_special_tokens=True
61 | )[0]
62 |
63 | try:
64 | text_output = json.loads(text_output)
65 | text = text_output["natural_text"]
66 | except Exception:
67 | try:
68 | text = text_output.split("natural_text")[1].strip()
69 | except Exception:
70 | text = ""
71 |
72 | return text
73 |
74 |
75 | class OlmOCRMethod(BaseMethod):
76 | olmocr_model: dict = None
77 | use_llm: bool = False
78 |
79 | def __call__(self, sample) -> BenchmarkResult:
80 | pdf_bytes = sample["pdf"] # This is a single page PDF
81 |
82 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
83 | f.write(pdf_bytes)
84 | start = time.time()
85 | result = convert_single_page(f.name, self.olmocr_model["model"], self.olmocr_model["processor"], self.olmocr_model["model"].device)
86 | total = time.time() - start
87 |
88 | return {
89 | "markdown": result,
90 | "time": total
91 | }
92 |
```
--------------------------------------------------------------------------------
/tests/renderers/test_markdown_renderer.py:
--------------------------------------------------------------------------------
```python
1 | import pytest
2 |
3 | from marker.renderers.markdown import MarkdownRenderer
4 | from marker.schema import BlockTypes
5 | from marker.schema.blocks import TableCell
6 |
7 |
8 | @pytest.mark.config({"page_range": [0], "disable_ocr": True})
9 | def test_markdown_renderer(pdf_document):
10 | renderer = MarkdownRenderer()
11 | md = renderer(pdf_document).markdown
12 |
13 | # Verify markdown
14 | assert "# Subspace Adversarial Training" in md
15 |
16 |
17 | @pytest.mark.config({"page_range": [0]})
18 | def test_markdown_renderer_auto_ocr(pdf_document):
19 | renderer = MarkdownRenderer()
20 | md = renderer(pdf_document).markdown
21 |
22 | # Verify markdown
23 | assert "Subspace Adversarial Training" in md
24 |
25 |
26 | @pytest.mark.config({"page_range": [0, 1], "paginate_output": True})
27 | def test_markdown_renderer_pagination(pdf_document):
28 | renderer = MarkdownRenderer({"paginate_output": True})
29 | md = renderer(pdf_document).markdown
30 |
31 | assert "\n\n{0}-" in md
32 | assert "\n\n{1}-" in md
33 |
34 |
35 | @pytest.mark.config({"page_range": [0, 1], "paginate_output": True})
36 | def test_markdown_renderer_pagination_blank_last_page(pdf_document):
37 | # Clear all children and structure from the last page to simulate a blank page
38 | last_page = pdf_document.pages[-1]
39 | last_page.children = []
40 | last_page.structure = []
41 |
42 | renderer = MarkdownRenderer({"paginate_output": True})
43 | md = renderer(pdf_document).markdown
44 |
45 | # Should end with pagination marker and preserve trailing newlines
46 | assert md.endswith("}\n\n") or md.endswith(
47 | "}------------------------------------------------\n\n"
48 | )
49 |
50 |
51 | @pytest.mark.config({"page_range": [0, 1]})
52 | def test_markdown_renderer_metadata(pdf_document):
53 | renderer = MarkdownRenderer({"paginate_output": True})
54 | metadata = renderer(pdf_document).metadata
55 | assert "table_of_contents" in metadata
56 |
57 |
58 | @pytest.mark.config({"page_range": [0, 1]})
59 | def test_markdown_renderer_images(pdf_document):
60 | renderer = MarkdownRenderer({"extract_images": False})
61 | markdown_output = renderer(pdf_document)
62 |
63 | assert len(markdown_output.images) == 0
64 | assert "
68 | def test_markdown_renderer_tables(pdf_document):
69 | table = pdf_document.contained_blocks((BlockTypes.Table,))[0]
70 | page = pdf_document.pages[0]
71 |
72 | cell = TableCell(
73 | polygon=table.polygon,
74 | text_lines=["54<i>.45</i>67<br>89<math>x</math>"],
75 | rowspan=1,
76 | colspan=1,
77 | row_id=0,
78 | col_id=0,
79 | is_header=False,
80 | page_id=page.page_id,
81 | )
82 | page.add_full_block(cell)
83 | table.structure = []
84 | table.add_structure(cell)
85 |
86 | renderer = MarkdownRenderer()
87 | md = renderer(pdf_document).markdown
88 | assert "54 <i>.45</i> 67<br>89 $x$" in md
89 |
```
--------------------------------------------------------------------------------
/marker/schema/registry.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Dict, Type
2 | from importlib import import_module
3 |
4 | from marker.schema import BlockTypes
5 | from marker.schema.blocks import (
6 | Block,
7 | Caption,
8 | Code,
9 | Equation,
10 | Figure,
11 | Footnote,
12 | Form,
13 | Handwriting,
14 | InlineMath,
15 | ListItem,
16 | PageFooter,
17 | PageHeader,
18 | Picture,
19 | SectionHeader,
20 | Table,
21 | TableOfContents,
22 | Text,
23 | ComplexRegion,
24 | TableCell,
25 | Reference,
26 | )
27 | from marker.schema.document import Document
28 | from marker.schema.groups import (
29 | FigureGroup,
30 | ListGroup,
31 | PageGroup,
32 | PictureGroup,
33 | TableGroup,
34 | )
35 | from marker.schema.text import Line, Span
36 | from marker.schema.text.char import Char
37 |
38 | BLOCK_REGISTRY: Dict[BlockTypes, str] = {}
39 |
40 |
41 | def register_block_class(block_type: BlockTypes, block_cls: Type[Block]):
42 | BLOCK_REGISTRY[block_type] = f"{block_cls.__module__}.{block_cls.__name__}"
43 |
44 |
45 | def get_block_class(block_type: BlockTypes) -> Type[Block]:
46 | class_path = BLOCK_REGISTRY[block_type]
47 | module_name, class_name = class_path.rsplit(".", 1)
48 | module = import_module(module_name)
49 | return getattr(module, class_name)
50 |
51 |
52 | register_block_class(BlockTypes.Line, Line)
53 | register_block_class(BlockTypes.Span, Span)
54 | register_block_class(BlockTypes.Char, Char)
55 | register_block_class(BlockTypes.FigureGroup, FigureGroup)
56 | register_block_class(BlockTypes.TableGroup, TableGroup)
57 | register_block_class(BlockTypes.ListGroup, ListGroup)
58 | register_block_class(BlockTypes.PictureGroup, PictureGroup)
59 | register_block_class(BlockTypes.Page, PageGroup)
60 | register_block_class(BlockTypes.Caption, Caption)
61 | register_block_class(BlockTypes.Code, Code)
62 | register_block_class(BlockTypes.Figure, Figure)
63 | register_block_class(BlockTypes.Footnote, Footnote)
64 | register_block_class(BlockTypes.Form, Form)
65 | register_block_class(BlockTypes.Equation, Equation)
66 | register_block_class(BlockTypes.Handwriting, Handwriting)
67 | register_block_class(BlockTypes.TextInlineMath, InlineMath)
68 | register_block_class(BlockTypes.ListItem, ListItem)
69 | register_block_class(BlockTypes.PageFooter, PageFooter)
70 | register_block_class(BlockTypes.PageHeader, PageHeader)
71 | register_block_class(BlockTypes.Picture, Picture)
72 | register_block_class(BlockTypes.SectionHeader, SectionHeader)
73 | register_block_class(BlockTypes.Table, Table)
74 | register_block_class(BlockTypes.Text, Text)
75 | register_block_class(BlockTypes.TableOfContents, TableOfContents)
76 | register_block_class(BlockTypes.ComplexRegion, ComplexRegion)
77 | register_block_class(BlockTypes.TableCell, TableCell)
78 | register_block_class(BlockTypes.Reference, Reference)
79 | register_block_class(BlockTypes.Document, Document)
80 |
81 | assert len(BLOCK_REGISTRY) == len(BlockTypes)
82 | assert all(
83 | [
84 | get_block_class(k).model_fields["block_type"].default == k
85 | for k, _ in BLOCK_REGISTRY.items()
86 | ]
87 | )
88 |
```
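The registry stores dotted import paths rather than classes, so `get_block_class` re-imports lazily and sidesteps circular imports between schema modules. Lookup is by `BlockTypes` member; the assert mirrors the module's own sanity check:

```python
from marker.schema import BlockTypes
from marker.schema.registry import get_block_class

TableCls = get_block_class(BlockTypes.Table)  # resolves marker.schema.blocks.table.Table
assert TableCls.model_fields["block_type"].default == BlockTypes.Table
```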
--------------------------------------------------------------------------------
/marker/processors/blockquote.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated, Tuple
2 |
3 | from marker.processors import BaseProcessor
4 | from marker.schema import BlockTypes
5 | from marker.schema.document import Document
6 |
7 |
8 | class BlockquoteProcessor(BaseProcessor):
9 | """
10 | A processor for tagging blockquotes.
11 | """
12 | block_types: Annotated[
13 | Tuple[BlockTypes],
14 | "The block types to process.",
15 | ] = (BlockTypes.Text, BlockTypes.TextInlineMath)
16 | min_x_indent: Annotated[
17 | float,
18 | "The minimum horizontal indentation required to consider a block as part of a blockquote.",
19 | "Expressed as a percentage of the block width.",
20 | ] = 0.1
21 | x_start_tolerance: Annotated[
22 | float,
23 | "The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.",
24 | "Expressed as a percentage of the block width.",
25 | ] = 0.01
26 | x_end_tolerance: Annotated[
27 | float,
28 | "The maximum allowable difference between the ending x-coordinates of consecutive blocks to consider them aligned.",
29 | "Expressed as a percentage of the block width.",
30 | ] = 0.01
31 |
32 | def __init__(self, config):
33 | super().__init__(config)
34 |
35 | def __call__(self, document: Document):
36 | for page in document.pages:
37 | for block in page.contained_blocks(document, self.block_types):
38 | if block.structure is None:
39 | continue
40 |
41 | if not len(block.structure) >= 2:
42 | continue
43 |
44 | next_block = page.get_next_block(block)
45 | if next_block is None:
46 | continue
47 | if next_block.block_type not in self.block_types:
48 | continue
49 | if next_block.structure is None:
50 | continue
51 | if next_block.ignore_for_output:
52 | continue
53 |
54 | matching_x_end = abs(next_block.polygon.x_end - block.polygon.x_end) < self.x_end_tolerance * block.polygon.width
55 | matching_x_start = abs(next_block.polygon.x_start - block.polygon.x_start) < self.x_start_tolerance * block.polygon.width
56 | x_indent = next_block.polygon.x_start > block.polygon.x_start + (self.min_x_indent * block.polygon.width)
57 | y_indent = next_block.polygon.y_start > block.polygon.y_end
58 |
59 | if block.blockquote:
60 | next_block.blockquote = (matching_x_end and matching_x_start) or (x_indent and y_indent)
61 | next_block.blockquote_level = block.blockquote_level
62 | if (x_indent and y_indent):
63 | next_block.blockquote_level += 1
64 | elif len(next_block.structure) >= 2 and (x_indent and y_indent):
65 | next_block.blockquote = True
66 | next_block.blockquote_level = 1
```
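The blockquote decision is pure bounding-box arithmetic: a following block either lines up with the current quote (matching x extents) or is indented by at least `min_x_indent` of the block width and starts below it. A numeric sketch with hypothetical coordinates:

```python
min_x_indent = 0.1

block = {"x_start": 100.0, "x_end": 500.0, "y_end": 300.0}  # hypothetical bboxes
nxt = {"x_start": 150.0, "x_end": 500.0, "y_start": 310.0}

width = block["x_end"] - block["x_start"]  # 400.0
x_indent = nxt["x_start"] > block["x_start"] + min_x_indent * width  # 150 > 140
y_indent = nxt["y_start"] > block["y_end"]  # 310 > 300
assert x_indent and y_indent  # the next block starts a deeper blockquote level
```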
--------------------------------------------------------------------------------
/tests/services/test_service_init.py:
--------------------------------------------------------------------------------
```python
1 | import pytest
2 |
3 | from marker.converters.pdf import PdfConverter
4 | from marker.services.gemini import GoogleGeminiService
5 | from marker.services.ollama import OllamaService
6 | from marker.services.vertex import GoogleVertexService
7 | from marker.services.openai import OpenAIService
8 | from marker.services.azure_openai import AzureOpenAIService
9 |
10 |
11 | @pytest.mark.output_format("markdown")
12 | @pytest.mark.config({"page_range": [0]})
13 | def test_empty_llm(pdf_converter: PdfConverter, temp_doc):
14 | assert pdf_converter.artifact_dict["llm_service"] is None
15 | assert pdf_converter.llm_service is None
16 |
17 |
18 | def test_llm_no_keys(model_dict, config):
19 | with pytest.raises(AssertionError):
20 | PdfConverter(artifact_dict=model_dict, config={"use_llm": True})
21 |
22 |
23 | @pytest.mark.output_format("markdown")
24 | @pytest.mark.config({"page_range": [0], "use_llm": True, "gemini_api_key": "test"})
25 | def test_llm_gemini(pdf_converter: PdfConverter, temp_doc):
26 | assert pdf_converter.artifact_dict["llm_service"] is not None
27 | assert isinstance(pdf_converter.llm_service, GoogleGeminiService)
28 |
29 |
30 | @pytest.mark.output_format("markdown")
31 | @pytest.mark.config(
32 | {
33 | "page_range": [0],
34 | "use_llm": True,
35 | "vertex_project_id": "test",
36 | "llm_service": "marker.services.vertex.GoogleVertexService",
37 | }
38 | )
39 | def test_llm_vertex(pdf_converter: PdfConverter, temp_doc):
40 | assert pdf_converter.artifact_dict["llm_service"] is not None
41 | assert isinstance(pdf_converter.llm_service, GoogleVertexService)
42 |
43 |
44 | @pytest.mark.output_format("markdown")
45 | @pytest.mark.config(
46 | {
47 | "page_range": [0],
48 | "use_llm": True,
49 | "llm_service": "marker.services.ollama.OllamaService",
50 | }
51 | )
52 | def test_llm_ollama(pdf_converter: PdfConverter, temp_doc):
53 | assert pdf_converter.artifact_dict["llm_service"] is not None
54 | assert isinstance(pdf_converter.llm_service, OllamaService)
55 |
56 |
57 | @pytest.mark.output_format("markdown")
58 | @pytest.mark.config(
59 | {
60 | "page_range": [0],
61 | "use_llm": True,
62 | "llm_service": "marker.services.openai.OpenAIService",
63 | "openai_api_key": "test",
64 | }
65 | )
66 | def test_llm_openai(pdf_converter: PdfConverter, temp_doc):
67 | assert pdf_converter.artifact_dict["llm_service"] is not None
68 | assert isinstance(pdf_converter.llm_service, OpenAIService)
69 |
70 |
71 | @pytest.mark.output_format("markdown")
72 | @pytest.mark.config(
73 | {
74 | "page_range": [0],
75 | "use_llm": True,
76 | "llm_service": "marker.services.azure_openai.AzureOpenAIService",
77 | "azure_endpoint": "https://example.openai.azure.com",
78 | "azure_api_key": "test",
79 | "deployment_name": "test-model",
80 | "azure_api_version": "1",
81 | }
82 | )
83 | def test_llm_azure_openai(pdf_converter: PdfConverter, temp_doc):
84 | assert pdf_converter.artifact_dict["llm_service"] is not None
85 | assert isinstance(pdf_converter.llm_service, AzureOpenAIService)
86 |
```
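
The same service selection can be exercised outside the test harness. A minimal sketch, assuming the `create_model_dict` helper from `marker.models` (not shown in this file) builds the artifact dict:

```python
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict  # assumed helper for model artifacts

converter = PdfConverter(
    artifact_dict=create_model_dict(),
    config={
        "use_llm": True,
        # The dotted import path selects the service class, as in the tests above
        "llm_service": "marker.services.ollama.OllamaService",
    },
)
assert converter.llm_service is not None
```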
--------------------------------------------------------------------------------
/marker/converters/extraction.py:
--------------------------------------------------------------------------------
```python
1 | import re
2 | from typing import Annotated
3 |
4 | from marker.builders.document import DocumentBuilder
5 | from marker.builders.line import LineBuilder
6 | from marker.builders.ocr import OcrBuilder
7 | from marker.builders.structure import StructureBuilder
8 | from marker.converters.pdf import PdfConverter
9 | from marker.extractors.document import DocumentExtractor
10 | from marker.extractors.page import PageExtractor
11 | from marker.providers.registry import provider_from_filepath
12 |
13 | from marker.renderers.extraction import ExtractionRenderer, ExtractionOutput
14 | from marker.renderers.markdown import MarkdownRenderer
15 |
16 | from marker.logger import get_logger
17 |
18 | logger = get_logger()
19 |
20 |
21 | class ExtractionConverter(PdfConverter):
22 | pattern: str = r"{\d+\}-{48}\n\n"
23 | existing_markdown: Annotated[
24 | str, "Markdown that was already converted for extraction."
25 | ] = None
26 |
27 | def build_document(self, filepath: str):
28 | provider_cls = provider_from_filepath(filepath)
29 | layout_builder = self.resolve_dependencies(self.layout_builder_class)
30 | line_builder = self.resolve_dependencies(LineBuilder)
31 | ocr_builder = self.resolve_dependencies(OcrBuilder)
32 | provider = provider_cls(filepath, self.config)
33 | document = DocumentBuilder(self.config)(
34 | provider, layout_builder, line_builder, ocr_builder
35 | )
36 | structure_builder_cls = self.resolve_dependencies(StructureBuilder)
37 | structure_builder_cls(document)
38 |
39 | for processor in self.processor_list:
40 | processor(document)
41 |
42 | return document, provider
43 |
44 | def __call__(self, filepath: str) -> ExtractionOutput:
45 | self.config["paginate_output"] = True # Ensure we can split the output properly
46 | self.config["output_format"] = (
47 | "markdown" # Output must be markdown for extraction
48 | )
49 | markdown = self.existing_markdown
50 |
51 | if not markdown:
52 | document, provider = self.build_document(filepath)
53 | self.page_count = len(document.pages)
54 | renderer = self.resolve_dependencies(MarkdownRenderer)
55 | output = renderer(document)
56 | markdown = output.markdown
57 |
58 | output_pages = re.split(self.pattern, markdown)[1:] # Split output into pages
59 |
60 |         # Extraction requires an LLM service; set a default in the artifact dict if none was configured
61 | if self.artifact_dict.get("llm_service") is None:
62 | self.artifact_dict["llm_service"] = self.resolve_dependencies(
63 | self.default_llm_service
64 | )
65 |
66 | page_extractor = self.resolve_dependencies(PageExtractor)
67 | document_extractor = self.resolve_dependencies(DocumentExtractor)
68 | renderer = self.resolve_dependencies(ExtractionRenderer)
69 |
70 | # Inference in parallel
71 | notes = page_extractor(output_pages)
72 | document_output = document_extractor(notes)
73 |
74 | merged = renderer(document_output, markdown)
75 | return merged
76 |
```
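
A usage sketch for the converter above. The `page_schema` key is an assumption about how the extractors are configured, and `create_model_dict` is the assumed artifact helper:

```python
from marker.converters.extraction import ExtractionConverter
from marker.models import create_model_dict  # assumed helper

converter = ExtractionConverter(
    artifact_dict=create_model_dict(),
    config={"page_schema": "..."},  # assumed: the schema the extractors should fill
)
result = converter("paper.pdf")  # hypothetical path; returns an ExtractionOutput
print(result.document_json)
```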
--------------------------------------------------------------------------------
/marker/providers/epub.py:
--------------------------------------------------------------------------------
```python
1 | import base64
2 | import os
3 | import tempfile
4 |
5 | from bs4 import BeautifulSoup
6 |
7 | from marker.providers.pdf import PdfProvider
8 |
9 | css = '''
10 | @page {
11 | size: A4;
12 | margin: 2cm;
13 | }
14 |
15 | img {
16 | max-width: 100%;
17 | max-height: 25cm;
18 | object-fit: contain;
19 | margin: 12pt auto;
20 | }
21 |
22 | div, p {
23 | max-width: 100%;
24 | word-break: break-word;
25 | font-size: 10pt;
26 | }
27 |
28 | table {
29 | width: 100%;
30 | border-collapse: collapse;
31 | break-inside: auto;
32 | font-size: 10pt;
33 | }
34 |
35 | tr {
36 | break-inside: avoid;
37 | page-break-inside: avoid;
38 | }
39 |
40 | td {
41 | border: 0.75pt solid #000;
42 | padding: 6pt;
43 | }
44 | '''
45 |
46 |
47 | class EpubProvider(PdfProvider):
48 | def __init__(self, filepath: str, config=None):
49 |         temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
50 | self.temp_pdf_path = temp_pdf.name
51 | temp_pdf.close()
52 |
53 | # Convert Epub to PDF
54 | try:
55 | self.convert_epub_to_pdf(filepath)
56 | except Exception as e:
57 | raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}")
58 |
59 | # Initialize the PDF provider with the temp pdf path
60 | super().__init__(self.temp_pdf_path, config)
61 |
62 | def __del__(self):
63 | if os.path.exists(self.temp_pdf_path):
64 | os.remove(self.temp_pdf_path)
65 |
66 | def convert_epub_to_pdf(self, filepath):
67 | from weasyprint import CSS, HTML
68 | from ebooklib import epub
69 | import ebooklib
70 |
71 | ebook = epub.read_epub(filepath)
72 |
73 | styles = []
74 | html_content = ""
75 | img_tags = {}
76 |
77 | for item in ebook.get_items():
78 | if item.get_type() == ebooklib.ITEM_IMAGE:
79 | img_data = base64.b64encode(item.get_content()).decode("utf-8")
80 | img_tags[item.file_name] = f'data:{item.media_type};base64,{img_data}'
81 | elif item.get_type() == ebooklib.ITEM_STYLE:
82 | styles.append(item.get_content().decode('utf-8'))
83 |
84 | for item in ebook.get_items():
85 | if item.get_type() == ebooklib.ITEM_DOCUMENT:
86 | html_content += item.get_content().decode("utf-8")
87 |
88 | soup = BeautifulSoup(html_content, 'html.parser')
89 | for img in soup.find_all('img'):
90 | src = img.get('src')
91 | if src:
92 | normalized_src = src.replace('../', '')
93 | if normalized_src in img_tags:
94 | img['src'] = img_tags[normalized_src]
95 |
96 | for image in soup.find_all('image'):
97 | src = image.get('xlink:href')
98 | if src:
99 | normalized_src = src.replace('../', '')
100 | if normalized_src in img_tags:
101 | image['xlink:href'] = img_tags[normalized_src]
102 |
103 | html_content = str(soup)
104 |         full_style = ''.join([css])  # the EPUB's own stylesheets are deliberately not applied
105 |
106 |         # Render the assembled HTML to a PDF
107 | HTML(string=html_content, base_url=filepath).write_pdf(
108 | self.temp_pdf_path,
109 | stylesheets=[CSS(string=full_style), self.get_font_css()]
110 | )
111 |
```
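
Usage sketch: the provider renders the EPUB to a temporary PDF up front, then behaves exactly like a `PdfProvider` over that file.

```python
from marker.providers.epub import EpubProvider

provider = EpubProvider("book.epub")  # hypothetical path
print(provider.temp_pdf_path)  # the intermediate PDF backing the provider
del provider  # __del__ removes the temporary PDF
```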
--------------------------------------------------------------------------------
/tests/processors/test_table_processor.py:
--------------------------------------------------------------------------------
```python
1 | from typing import List
2 |
3 | import pytest
4 |
5 | from marker.renderers.markdown import MarkdownRenderer
6 | from marker.schema import BlockTypes
7 | from marker.processors.table import TableProcessor
8 | from marker.schema.blocks import TableCell
9 |
10 |
11 | @pytest.mark.config({"page_range": [5]})
12 | def test_table_processor(
13 | pdf_document, recognition_model, table_rec_model, detection_model
14 | ):
15 | processor = TableProcessor(recognition_model, table_rec_model, detection_model)
16 | processor(pdf_document)
17 |
18 | for block in pdf_document.pages[0].children:
19 | if block.block_type == BlockTypes.Table:
20 | children = block.contained_blocks(pdf_document, (BlockTypes.TableCell,))
21 | assert children
22 | assert len(children) > 0
23 | assert isinstance(children[0], TableCell)
24 |
25 | assert len(pdf_document.contained_blocks((BlockTypes.Table,))) == 2
26 |
27 | renderer = MarkdownRenderer()
28 | table_output = renderer(pdf_document)
29 | assert "Schedule" in table_output.markdown
30 |
31 |
32 | @pytest.mark.filename("table_ex.pdf")
33 | @pytest.mark.config({"page_range": [0], "force_ocr": True})
34 | def test_avoid_double_ocr(
35 | pdf_document, recognition_model, table_rec_model, detection_model
36 | ):
37 | tables = pdf_document.contained_blocks((BlockTypes.Table,))
38 | lines = tables[0].contained_blocks(pdf_document, (BlockTypes.Line,))
39 | assert len(lines) == 0
40 |
41 | processor = TableProcessor(
42 | recognition_model, table_rec_model, detection_model, config={"force_ocr": True}
43 | )
44 | processor(pdf_document)
45 |
46 | renderer = MarkdownRenderer()
47 | table_output = renderer(pdf_document)
48 | assert "Participants" in table_output.markdown
49 |
50 |
51 | @pytest.mark.filename("multicol-blocks.pdf")
52 | @pytest.mark.config({"page_range": [3]})
53 | def test_overlap_blocks(
54 | pdf_document, detection_model, recognition_model, table_rec_model
55 | ):
56 | page = pdf_document.pages[0]
57 | assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text(
58 | pdf_document
59 | )
60 |
61 | processor = TableProcessor(recognition_model, table_rec_model, detection_model)
62 | processor(pdf_document)
63 |
64 | assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text(
65 | pdf_document
66 | )
67 |
68 |
69 | @pytest.mark.filename("pres.pdf")
70 | @pytest.mark.config({"page_range": [4]})
71 | def test_ocr_table(pdf_document, recognition_model, table_rec_model, detection_model):
72 | processor = TableProcessor(recognition_model, table_rec_model, detection_model)
73 | processor(pdf_document)
74 |
75 | renderer = MarkdownRenderer()
76 | table_output = renderer(pdf_document)
77 | assert "1.2E-38" in table_output.markdown
78 |
79 |
80 | @pytest.mark.config({"page_range": [11]})
81 | def test_split_rows(pdf_document, recognition_model, table_rec_model, detection_model):
82 | processor = TableProcessor(recognition_model, table_rec_model, detection_model)
83 | processor(pdf_document)
84 |
85 | table = pdf_document.contained_blocks((BlockTypes.Table,))[-1]
86 | cells: List[TableCell] = table.contained_blocks(
87 | pdf_document, (BlockTypes.TableCell,)
88 | )
89 | unique_rows = len(set([cell.row_id for cell in cells]))
90 | assert unique_rows == 6
91 |
```
--------------------------------------------------------------------------------
/marker/renderers/json.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated, Dict, List, Tuple
2 |
3 | from pydantic import BaseModel
4 |
5 | from marker.renderers import BaseRenderer
6 | from marker.schema import BlockTypes
7 | from marker.schema.blocks import Block, BlockOutput
8 | from marker.schema.document import Document
9 | from marker.schema.registry import get_block_class
10 |
11 |
12 | class JSONBlockOutput(BaseModel):
13 | id: str
14 | block_type: str
15 | html: str
16 | polygon: List[List[float]]
17 | bbox: List[float]
18 | children: List["JSONBlockOutput"] | None = None
19 | section_hierarchy: Dict[int, str] | None = None
20 | images: dict | None = None
21 |
22 |
23 | class JSONOutput(BaseModel):
24 | children: List[JSONBlockOutput]
25 | block_type: str = str(BlockTypes.Document)
26 | metadata: dict
27 |
28 |
29 | def reformat_section_hierarchy(section_hierarchy):
30 | new_section_hierarchy = {}
31 | for key, value in section_hierarchy.items():
32 | new_section_hierarchy[key] = str(value)
33 | return new_section_hierarchy
34 |
35 |
36 | class JSONRenderer(BaseRenderer):
37 | """
38 | A renderer for JSON output.
39 | """
40 |
41 | image_blocks: Annotated[
42 | Tuple[BlockTypes],
43 | "The list of block types to consider as images.",
44 | ] = (BlockTypes.Picture, BlockTypes.Figure)
45 | page_blocks: Annotated[
46 | Tuple[BlockTypes],
47 | "The list of block types to consider as pages.",
48 | ] = (BlockTypes.Page,)
49 |
50 | def extract_json(self, document: Document, block_output: BlockOutput):
51 | cls = get_block_class(block_output.id.block_type)
52 | if cls.__base__ == Block:
53 | html, images = self.extract_block_html(document, block_output)
54 | return JSONBlockOutput(
55 | html=html,
56 | polygon=block_output.polygon.polygon,
57 | bbox=block_output.polygon.bbox,
58 | id=str(block_output.id),
59 | block_type=str(block_output.id.block_type),
60 | images=images,
61 | section_hierarchy=reformat_section_hierarchy(
62 | block_output.section_hierarchy
63 | ),
64 | )
65 | else:
66 | children = []
67 | for child in block_output.children:
68 | child_output = self.extract_json(document, child)
69 | children.append(child_output)
70 |
71 | return JSONBlockOutput(
72 | html=block_output.html,
73 | polygon=block_output.polygon.polygon,
74 | bbox=block_output.polygon.bbox,
75 | id=str(block_output.id),
76 | block_type=str(block_output.id.block_type),
77 | children=children,
78 | section_hierarchy=reformat_section_hierarchy(
79 | block_output.section_hierarchy
80 | ),
81 | )
82 |
83 | def __call__(self, document: Document) -> JSONOutput:
84 | document_output = document.render(self.block_config)
85 | json_output = []
86 | for page_output in document_output.children:
87 | json_output.append(self.extract_json(document, page_output))
88 | return JSONOutput(
89 | children=json_output,
90 | metadata=self.generate_document_metadata(document, document_output),
91 | )
92 |
```
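
A small helper sketch for consuming the nested output above: a depth-first walk over a `JSONOutput` tree.

```python
from marker.renderers.json import JSONBlockOutput, JSONOutput

def iter_blocks(block: JSONBlockOutput):
    """Yield a block and all of its descendants, depth-first (illustrative)."""
    yield block
    for child in block.children or []:
        yield from iter_blocks(child)

def all_blocks(output: JSONOutput):
    for page in output.children:
        yield from iter_blocks(page)
```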
--------------------------------------------------------------------------------
/marker/processors/llm/llm_image_description.py:
--------------------------------------------------------------------------------
```python
1 | from pydantic import BaseModel
2 |
3 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData
4 |
5 | from marker.schema import BlockTypes
6 | from marker.schema.document import Document
7 |
8 | from typing import Annotated, List
9 |
10 |
11 | class LLMImageDescriptionProcessor(BaseLLMSimpleBlockProcessor):
12 | block_types = (
13 | BlockTypes.Picture,
14 | BlockTypes.Figure,
15 | )
16 | extract_images: Annotated[bool, "Extract images from the document."] = True
17 | image_description_prompt: Annotated[
18 | str,
19 | "The prompt to use for generating image descriptions.",
20 | "Default is a string containing the Gemini prompt.",
21 | ] = """You are a document analysis expert who specializes in creating text descriptions for images.
22 | You will receive an image of a picture or figure. Your job will be to create a short description of the image.
23 | **Instructions:**
24 | 1. Carefully examine the provided image.
25 | 2. Analyze any text that was extracted from within the image.
26 | 3. Output a faithful description of the image. Make sure there is enough specific detail to accurately reconstruct the image. If the image is a figure or contains numeric data, include the numeric data in the output.
27 | **Example:**
28 | Input:
29 | ```text
30 | "Fruit Preference Survey"
31 | 20, 15, 10
32 | Apples, Bananas, Oranges
33 | ```
34 | Output:
35 | In this figure, a bar chart titled "Fruit Preference Survey" is showing the number of people who prefer different types of fruits. The x-axis shows the types of fruits, and the y-axis shows the number of people. The bar chart shows that most people prefer apples, followed by bananas and oranges. 20 people prefer apples, 15 people prefer bananas, and 10 people prefer oranges.
36 | **Input:**
37 | ```text
38 | {raw_text}
39 | ```
40 | """
41 |
42 | def inference_blocks(self, document: Document) -> List[BlockData]:
43 | blocks = super().inference_blocks(document)
44 |         if self.extract_images:  # images are kept in the output as-is, so no descriptions are generated
45 |             return []
46 | return blocks
47 |
48 | def block_prompts(self, document: Document) -> List[PromptData]:
49 | prompt_data = []
50 | for block_data in self.inference_blocks(document):
51 | block = block_data["block"]
52 | prompt = self.image_description_prompt.replace(
53 | "{raw_text}", block.raw_text(document)
54 | )
55 | image = self.extract_image(document, block)
56 |
57 | prompt_data.append(
58 | {
59 | "prompt": prompt,
60 | "image": image,
61 | "block": block,
62 | "schema": ImageSchema,
63 | "page": block_data["page"],
64 | }
65 | )
66 |
67 | return prompt_data
68 |
69 | def rewrite_block(
70 | self, response: dict, prompt_data: PromptData, document: Document
71 | ):
72 | block = prompt_data["block"]
73 |
74 | if not response or "image_description" not in response:
75 | block.update_metadata(llm_error_count=1)
76 | return
77 |
78 | image_description = response["image_description"]
79 | if len(image_description) < 10:
80 | block.update_metadata(llm_error_count=1)
81 | return
82 |
83 | block.description = image_description
84 |
85 |
86 | class ImageSchema(BaseModel):
87 | image_description: str
88 |
```
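
Note the gate in `inference_blocks` above: descriptions are only generated when image extraction is turned off. An illustrative config:

```python
# Illustrative: keep images out of the rendered output and describe them instead.
config = {"use_llm": True, "extract_images": False}
```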
--------------------------------------------------------------------------------
/benchmarks/overall/display/table.py:
--------------------------------------------------------------------------------
```python
1 | from pathlib import Path
2 | from typing import Dict, List
3 |
4 | import tabulate
5 |
6 | from benchmarks.overall.schema import FullResult
7 |
8 | def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
9 | table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
10 | with open(out_path / filename, "w", encoding="utf-8") as f:
11 | f.write(f"# {title}\n")
12 | f.write(table)
13 | print(title)
14 | print(table)
15 |
16 |
17 | def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
18 | document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
19 | headers = ["Document Type"]
20 | for method in methods:
21 | for score_type in score_types:
22 | headers.append(f"{method} {score_type}")
23 |
24 | document_rows = [[k] for k in document_types]
25 | for i, doc_type in enumerate(document_types):
26 | for method in methods:
27 | for score_type in score_types:
28 | avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
29 | document_rows[i].append(avg_score)
30 |
31 | write_table("Document Types", document_rows, headers, out_path, "document_types.md")
32 |
33 | headers = ["Block Type"]
34 | block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks
35 | block_score_types = list(result["averages_by_block_type"][default_method].keys())
36 | for method in methods:
37 | for score_type in block_score_types:
38 | headers.append(f"{method} {score_type}")
39 |
40 | block_rows = [[k] for k in block_types]
41 | for i, block_type in enumerate(block_types):
42 | for method in methods:
43 | for score_type in block_score_types:
44 | avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
45 | block_rows[i].append(avg_score)
46 |
47 | write_table("Block types", block_rows, headers, out_path, "block_types.md")
48 |
49 | headers = ["Method", "Avg Time"] + score_types
50 | inference_rows = [[k] for k in methods]
51 | all_raw_scores = [result["scores"][i] for i in result["scores"]]
52 | for i, method in enumerate(methods):
53 | avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
54 | inference_rows[i].append(avg_time)
55 | for score_type in score_types:
56 | scores_lst = []
57 | for ar in all_raw_scores:
58 | try:
59 | # Sometimes a few llm scores are missing
60 | scores_lst.append(ar[method][score_type]["score"])
61 | except KeyError:
62 | continue
63 | avg_score = sum(scores_lst) / max(1, len(scores_lst))
64 | inference_rows[i].append(avg_score)
65 |
66 | write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
67 |
68 | print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
```
--------------------------------------------------------------------------------
/marker/renderers/chunk.py:
--------------------------------------------------------------------------------
```python
1 | import html
2 | from typing import List, Dict
3 |
4 | from bs4 import BeautifulSoup
5 | from pydantic import BaseModel
6 |
7 | from marker.renderers.json import JSONRenderer, JSONBlockOutput
8 | from marker.schema.document import Document
9 |
10 |
11 | class FlatBlockOutput(BaseModel):
12 | id: str
13 | block_type: str
14 | html: str
15 | page: int
16 | polygon: List[List[float]]
17 | bbox: List[float]
18 | section_hierarchy: Dict[int, str] | None = None
19 | images: dict | None = None
20 |
21 |
22 | class ChunkOutput(BaseModel):
23 | blocks: List[FlatBlockOutput]
24 | page_info: Dict[int, dict]
25 | metadata: dict
26 |
27 | def collect_images(block: JSONBlockOutput) -> dict[str, str]:
28 | if not getattr(block, "children", None):
29 | return block.images or {}
30 | else:
31 | images = block.images or {}
32 | for child_block in block.children:
33 | images.update(collect_images(child_block))
34 | return images
35 |
36 | def assemble_html_with_images(block: JSONBlockOutput, image_blocks: set[str]) -> str:
37 | if not getattr(block, "children", None):
38 | if block.block_type in image_blocks:
39 | return f"<p>{block.html}<img src='{block.id}'></p>"
40 | else:
41 | return block.html
42 |
43 | child_html = [assemble_html_with_images(child, image_blocks) for child in block.children]
44 | child_ids = [child.id for child in block.children]
45 |
46 | soup = BeautifulSoup(block.html, "html.parser")
47 | content_refs = soup.find_all("content-ref")
48 | for ref in content_refs:
49 | src_id = ref.attrs["src"]
50 | if src_id in child_ids:
51 | ref.replace_with(child_html[child_ids.index(src_id)])
52 |
53 | return html.unescape(str(soup))
54 |
55 | def json_to_chunks(
56 | block: JSONBlockOutput, image_blocks: set[str], page_id: int=0) -> FlatBlockOutput | List[FlatBlockOutput]:
57 | if block.block_type == "Page":
58 | children = block.children
59 | page_id = int(block.id.split("/")[-1])
60 | return [json_to_chunks(child, image_blocks, page_id=page_id) for child in children]
61 | else:
62 | return FlatBlockOutput(
63 | id=block.id,
64 | block_type=block.block_type,
65 | html=assemble_html_with_images(block, image_blocks),
66 | page=page_id,
67 | polygon=block.polygon,
68 | bbox=block.bbox,
69 | section_hierarchy=block.section_hierarchy,
70 | images=collect_images(block),
71 | )
72 |
73 |
74 | class ChunkRenderer(JSONRenderer):
75 |
76 | def __call__(self, document: Document) -> ChunkOutput:
77 | document_output = document.render(self.block_config)
78 | json_output = []
79 | for page_output in document_output.children:
80 | json_output.append(self.extract_json(document, page_output))
81 |
82 | # This will get the top-level blocks from every page
83 | chunk_output = []
84 | for item in json_output:
85 | chunks = json_to_chunks(item, set([str(block) for block in self.image_blocks]))
86 | chunk_output.extend(chunks)
87 |
88 | page_info = {
89 | page.page_id: {"bbox": page.polygon.bbox, "polygon": page.polygon.polygon}
90 | for page in document.pages
91 | }
92 |
93 | return ChunkOutput(
94 | blocks=chunk_output,
95 | page_info=page_info,
96 | metadata=self.generate_document_metadata(document, document_output),
97 | )
98 |
```
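
A sketch for consuming the flattened output, grouping chunk HTML by page id:

```python
from collections import defaultdict
from typing import Dict, List

from marker.renderers.chunk import ChunkOutput

def chunks_by_page(output: ChunkOutput) -> Dict[int, List[str]]:
    """Group flattened chunk HTML by page id (illustrative helper)."""
    pages: Dict[int, List[str]] = defaultdict(list)
    for blk in output.blocks:
        pages[blk.page].append(blk.html)
    return dict(pages)
```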
--------------------------------------------------------------------------------
/marker/schema/document.py:
--------------------------------------------------------------------------------
```python
1 | from __future__ import annotations
2 |
3 | from typing import List, Sequence, Optional
4 |
5 | from pydantic import BaseModel
6 |
7 | from marker.schema import BlockTypes
8 | from marker.schema.blocks import Block, BlockId, BlockOutput
9 | from marker.schema.groups.page import PageGroup
10 |
11 |
12 | class DocumentOutput(BaseModel):
13 | children: List[BlockOutput]
14 | html: str
15 | block_type: BlockTypes = BlockTypes.Document
16 |
17 |
18 | class TocItem(BaseModel):
19 | title: str
20 | heading_level: int
21 | page_id: int
22 | polygon: List[List[float]]
23 |
24 |
25 | class Document(BaseModel):
26 | filepath: str
27 | pages: List[PageGroup]
28 | block_type: BlockTypes = BlockTypes.Document
29 | table_of_contents: List[TocItem] | None = None
30 | debug_data_path: str | None = None # Path that debug data was saved to
31 |
32 | def get_block(self, block_id: BlockId):
33 | page = self.get_page(block_id.page_id)
34 | block = page.get_block(block_id)
35 | if block:
36 | return block
37 | return None
38 |
39 | def get_page(self, page_id):
40 | for page in self.pages:
41 | if page.page_id == page_id:
42 | return page
43 | return None
44 |
45 | def get_next_block(
46 | self, block: Block, ignored_block_types: List[BlockTypes] = None
47 | ):
48 | if ignored_block_types is None:
49 | ignored_block_types = []
50 | next_block = None
51 |
52 | # Try to find the next block in the current page
53 | page = self.get_page(block.page_id)
54 | next_block = page.get_next_block(block, ignored_block_types)
55 | if next_block:
56 | return next_block
57 |
58 | # If no block found, search subsequent pages
59 | for page in self.pages[self.pages.index(page) + 1 :]:
60 | next_block = page.get_next_block(None, ignored_block_types)
61 | if next_block:
62 | return next_block
63 | return None
64 |
65 | def get_next_page(self, page: PageGroup):
66 | page_idx = self.pages.index(page)
67 | if page_idx + 1 < len(self.pages):
68 | return self.pages[page_idx + 1]
69 | return None
70 |
71 | def get_prev_block(self, block: Block):
72 | page = self.get_page(block.page_id)
73 | prev_block = page.get_prev_block(block)
74 | if prev_block:
75 | return prev_block
76 | prev_page = self.get_prev_page(page)
77 | if not prev_page:
78 | return None
79 | return prev_page.get_block(prev_page.structure[-1])
80 |
81 | def get_prev_page(self, page: PageGroup):
82 | page_idx = self.pages.index(page)
83 | if page_idx > 0:
84 | return self.pages[page_idx - 1]
85 | return None
86 |
87 | def assemble_html(
88 | self, child_blocks: List[Block], block_config: Optional[dict] = None
89 | ):
90 | template = ""
91 | for c in child_blocks:
92 | template += f"<content-ref src='{c.id}'></content-ref>"
93 | return template
94 |
95 | def render(self, block_config: Optional[dict] = None):
96 | child_content = []
97 | section_hierarchy = None
98 | for page in self.pages:
99 | rendered = page.render(self, None, section_hierarchy, block_config)
100 | section_hierarchy = rendered.section_hierarchy.copy()
101 | child_content.append(rendered)
102 |
103 | return DocumentOutput(
104 | children=child_content,
105 | html=self.assemble_html(child_content, block_config),
106 | )
107 |
108 | def contained_blocks(self, block_types: Sequence[BlockTypes] = None) -> List[Block]:
109 | blocks = []
110 | for page in self.pages:
111 | blocks += page.contained_blocks(self, block_types)
112 | return blocks
113 |
```
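
`get_next_block` crosses page boundaries, so it supports a linear walk over the whole document. A minimal sketch:

```python
from marker.schema.blocks import Block
from marker.schema.document import Document

def walk_blocks(document: Document, start: Block):
    """Yield blocks in reading order, crossing page boundaries (illustrative)."""
    block = start
    while block is not None:
        yield block
        block = document.get_next_block(block)
```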
--------------------------------------------------------------------------------
/benchmarks/overall/scorers/heuristic.py:
--------------------------------------------------------------------------------
```python
1 | from typing import List
2 |
3 | from rapidfuzz import fuzz
4 |
5 | from benchmarks.overall.scorers.clean import MarkdownCleaner
6 | from benchmarks.overall.scorers.schema import BlockScores
7 | from benchmarks.overall.scorers import BaseScorer
8 |
9 |
10 | class HeuristicScorer(BaseScorer):
11 | def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
12 | if not method_markdown:
13 | return {
14 | "score": 0,
15 | "specific_scores": {
16 | "order": 0,
17 | "by_block": [0] * len(gt_markdown)
18 | }
19 | }
20 |
21 | # Standardize inputs
22 | gt_markdown = [self.clean_input(block) for block in gt_markdown]
23 | method_markdown = self.clean_input(method_markdown)
24 |
25 | alignments = self.find_fuzzy_alignments(method_markdown, gt_markdown)
26 | scores = [alignment["score"] for alignment in alignments]
27 |
28 | # Find order score
29 | orders = [alignment["start"] for alignment in alignments]
30 | correct_order = list(range(len(gt_markdown)))
31 | actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x])
32 | order_score = self.kendall_tau(correct_order, actual_order)
33 |
34 | # Weight score by sequence length
35 | gt_weights = [len(g) for g in gt_markdown]
36 | weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
37 |
38 |         # Normalize by total weight, then blend in the order score (80/20)
39 | overall_score = sum(weighted_scores) / max(1, sum(gt_weights))
40 | overall_score = overall_score * 0.8 + order_score * 0.2
41 | return {
42 | "score": overall_score,
43 | "specific_scores": {
44 | "order": order_score,
45 | "by_block": scores
46 | },
47 | }
48 |
49 | @staticmethod
50 | def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
51 | n = len(correct_order)
52 | concordant = 0
53 | discordant = 0
54 |
55 | if n <= 1:
56 | return 100
57 |
58 | for i in range(n):
59 | for j in range(i + 1, n):
60 | correct_sign = correct_order[i] - correct_order[j]
61 | actual_sign = actual_order[i] - actual_order[j]
62 |
63 | if (correct_sign > 0 and actual_sign > 0) or (correct_sign < 0 and actual_sign < 0):
64 | concordant += 1
65 | elif (correct_sign < 0 and actual_sign > 0) or (correct_sign > 0 and actual_sign < 0):
66 | discordant += 1
67 |
68 | total_pairs = (n * (n - 1)) // 2
69 | tau = (concordant - discordant) / total_pairs
70 | tau = (tau + 1) / 2 # 0-1 scale
71 | return tau * 100 # 0-100 scale
72 |
73 | @staticmethod
74 | def find_fuzzy_alignments(
75 | main_string: str,
76 | substrings: List[str],
77 | threshold: int = 70
78 | ) -> List[dict]:
79 | alignments = []
80 |
81 | for idx, substr in enumerate(substrings):
82 | result = fuzz.partial_ratio_alignment(substr, main_string, score_cutoff=threshold)
83 |
84 | score = 0
85 | dest_start = 0
86 | dest_end = 0
87 | if result:
88 | score = result.score
89 | dest_start = result.dest_start
90 | dest_end = result.dest_end
91 |
92 | alignments.append({
93 | "string": substr,
94 | "start": dest_start,
95 | "end": dest_end,
96 | "score": score,
97 | "idx": idx
98 | })
99 | return alignments
100 |
101 |
102 | @staticmethod
103 | def clean_input(md: str):
104 | cleaner = MarkdownCleaner()
105 | return cleaner(md)
```
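
Both scoring helpers are static, so they are easy to sanity-check in isolation. The rescaled Kendall tau maps a perfect ordering to 100 and a fully reversed one to 0:

```python
from benchmarks.overall.scorers.heuristic import HeuristicScorer

print(HeuristicScorer.kendall_tau([0, 1, 2], [0, 1, 2]))  # 100.0 (perfect order)
print(HeuristicScorer.kendall_tau([0, 1, 2], [2, 1, 0]))  # 0.0 (fully reversed)
```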
--------------------------------------------------------------------------------
/marker/output.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import os
3 |
4 | from bs4 import BeautifulSoup, Tag
5 | from pydantic import BaseModel
6 | from PIL import Image
7 |
8 | from marker.renderers.extraction import ExtractionOutput
9 | from marker.renderers.html import HTMLOutput
10 | from marker.renderers.json import JSONOutput, JSONBlockOutput
11 | from marker.renderers.markdown import MarkdownOutput
12 | from marker.renderers.ocr_json import OCRJSONOutput
13 | from marker.schema.blocks import BlockOutput
14 | from marker.settings import settings
15 |
16 |
17 | def unwrap_outer_tag(html: str):
18 | soup = BeautifulSoup(html, "html.parser")
19 | contents = list(soup.contents)
20 | if len(contents) == 1 and isinstance(contents[0], Tag) and contents[0].name == "p":
21 | # Unwrap the p tag
22 | soup.p.unwrap()
23 |
24 | return str(soup)
25 |
26 |
27 | def json_to_html(block: JSONBlockOutput | BlockOutput):
28 | # Utility function to take in json block output and give html for the block.
29 | if not getattr(block, "children", None):
30 | return block.html
31 | else:
32 | child_html = [json_to_html(child) for child in block.children]
33 | child_ids = [child.id for child in block.children]
34 |
35 | soup = BeautifulSoup(block.html, "html.parser")
36 | content_refs = soup.find_all("content-ref")
37 | for ref in content_refs:
38 | src_id = ref.attrs["src"]
39 | if src_id in child_ids:
40 | child_soup = BeautifulSoup(
41 | child_html[child_ids.index(src_id)], "html.parser"
42 | )
43 | ref.replace_with(child_soup)
44 | return str(soup)
45 |
46 |
47 | def output_exists(output_dir: str, fname_base: str):
48 | exts = ["md", "html", "json"]
49 | for ext in exts:
50 | if os.path.exists(os.path.join(output_dir, f"{fname_base}.{ext}")):
51 | return True
52 | return False
53 |
54 |
55 | def text_from_rendered(rendered: BaseModel):
56 | from marker.renderers.chunk import ChunkOutput # Has an import from this file
57 |
58 | if isinstance(rendered, MarkdownOutput):
59 | return rendered.markdown, "md", rendered.images
60 | elif isinstance(rendered, HTMLOutput):
61 | return rendered.html, "html", rendered.images
62 | elif isinstance(rendered, JSONOutput):
63 | return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
64 | elif isinstance(rendered, ChunkOutput):
65 | return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
66 | elif isinstance(rendered, OCRJSONOutput):
67 | return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
68 | elif isinstance(rendered, ExtractionOutput):
69 | return rendered.document_json, "json", {}
70 | else:
71 | raise ValueError("Invalid output type")
72 |
73 |
74 | def convert_if_not_rgb(image: Image.Image) -> Image.Image:
75 | if image.mode != "RGB":
76 | image = image.convert("RGB")
77 | return image
78 |
79 |
80 | def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
81 | text, ext, images = text_from_rendered(rendered)
82 | text = text.encode(settings.OUTPUT_ENCODING, errors="replace").decode(
83 | settings.OUTPUT_ENCODING
84 | )
85 |
86 | with open(
87 | os.path.join(output_dir, f"{fname_base}.{ext}"),
88 | "w+",
89 | encoding=settings.OUTPUT_ENCODING,
90 | ) as f:
91 | f.write(text)
92 | with open(
93 | os.path.join(output_dir, f"{fname_base}_meta.json"),
94 | "w+",
95 | encoding=settings.OUTPUT_ENCODING,
96 | ) as f:
97 | f.write(json.dumps(rendered.metadata, indent=2))
98 |
99 | for img_name, img in images.items():
100 | img = convert_if_not_rgb(img) # RGBA images can't save as JPG
101 | img.save(os.path.join(output_dir, img_name), settings.OUTPUT_IMAGE_FORMAT)
102 |
```
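
A small sketch of the text extraction path, assuming `MarkdownOutput` carries `markdown`, `images`, and `metadata` fields:

```python
from marker.output import text_from_rendered
from marker.renderers.markdown import MarkdownOutput

rendered = MarkdownOutput(markdown="# Title", images={}, metadata={})
text, ext, images = text_from_rendered(rendered)
print(ext)  # "md"
```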
--------------------------------------------------------------------------------
/marker/processors/block_relabel.py:
--------------------------------------------------------------------------------
```python
1 | from copy import deepcopy
2 | from typing import Annotated
3 |
4 | from marker.processors import BaseProcessor
5 | from marker.schema import BlockTypes
6 | from marker.schema.blocks import BlockId
7 | from marker.schema.document import Document
8 | from marker.schema.registry import get_block_class
9 |
10 | from marker.logger import get_logger
11 | logger = get_logger()
12 |
13 | class BlockRelabelProcessor(BaseProcessor):
14 | """
15 | A processor to heuristically relabel blocks based on a confidence threshold.
16 |
17 | Each rule in the relabel string maps an original block label to a new one
18 |     when the model's confidence in the original label is at or below the given threshold.
19 | """
20 |
21 | block_relabel_str: Annotated[
22 | str,
23 | "Comma-separated relabeling rules in the format '<original_label>:<new_label>:<confidence_threshold>'.",
24 | "Each rule defines how blocks of a certain type should be relabeled when the confidence exceeds the threshold.",
25 | "Example: 'Table:Picture:0.85,Form:Picture:0.9'"
26 | ] = ""
27 |
28 | def __init__(self, config=None):
29 | super().__init__(config)
30 | self.block_relabel_map = {}
31 |
32 | if not self.block_relabel_str:
33 | return
34 |
35 | for i, block_config_str in enumerate(self.block_relabel_str.split(',')):
36 | block_config_str = block_config_str.strip()
37 | if not block_config_str:
38 | continue # Skip empty segments
39 |
40 | try:
41 | parts = block_config_str.split(':')
42 | if len(parts) != 3:
43 | raise ValueError(f"Expected 3 parts, got {len(parts)}")
44 |
45 | block_label, block_relabel, confidence_str = parts
46 | confidence_thresh = float(confidence_str)
47 |
48 | block_type = BlockTypes[block_label]
49 | relabel_block_type = BlockTypes[block_relabel]
50 |
51 | self.block_relabel_map[block_type] = (
52 | confidence_thresh,
53 | relabel_block_type
54 | )
55 | except Exception as e:
56 | logger.warning(f"Failed to parse relabel rule '{block_config_str}' at index {i}: {e}. Expected format is <original_label>:<new_label>:<confidence_threshold>")
57 |
58 | def __call__(self, document: Document):
59 | if len(self.block_relabel_map) == 0:
60 | return
61 |
62 | for page in document.pages:
63 | for block in page.structure_blocks(document):
64 | if block.block_type not in self.block_relabel_map:
65 | continue
66 |
67 | block_id = BlockId(page_id=page.page_id, block_id=block.block_id, block_type=block.block_type)
68 | confidence_thresh, relabel_block_type = self.block_relabel_map[block.block_type]
69 | confidence = block.top_k.get(block.block_type)
70 | if confidence > confidence_thresh:
71 | logger.debug(f"Skipping relabel for {block_id}; Confidence: {confidence} > Confidence Threshold {confidence_thresh} for re-labelling")
72 | continue
73 |
74 | new_block_cls = get_block_class(relabel_block_type)
75 | new_block = new_block_cls(
76 | polygon=deepcopy(block.polygon),
77 | page_id=block.page_id,
78 | structure=deepcopy(block.structure),
79 | text_extraction_method=block.text_extraction_method,
80 | source="heuristics",
81 | top_k=block.top_k,
82 | metadata=block.metadata
83 | )
84 | page.replace_block(block, new_block)
85 | logger.debug(f"Relabelled {block_id} to {relabel_block_type}")
```
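
An illustrative config for the rule format documented above (`<original_label>:<new_label>:<confidence_threshold>`); this relabels `Table` blocks as `Picture` when the model's confidence in the original label is at or below 0.6:

```python
from marker.processors.block_relabel import BlockRelabelProcessor

processor = BlockRelabelProcessor(config={"block_relabel_str": "Table:Picture:0.6"})
# processor(document)  # document: marker.schema.document.Document
```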
--------------------------------------------------------------------------------
/marker/providers/spreadsheet.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 | import tempfile
3 |
4 | from marker.providers.pdf import PdfProvider
5 |
6 | css = '''
7 | @page {
8 | size: A4 landscape;
9 | margin: 1.5cm;
10 | }
11 |
12 | table {
13 | width: 100%;
14 | border-collapse: collapse;
15 | break-inside: auto;
16 | font-size: 10pt;
17 | }
18 |
19 | tr {
20 | break-inside: avoid;
21 | page-break-inside: avoid;
22 | }
23 |
24 | td {
25 | border: 0.75pt solid #000;
26 | padding: 6pt;
27 | }
28 | '''
29 |
30 |
31 | class SpreadSheetProvider(PdfProvider):
32 | def __init__(self, filepath: str, config=None):
33 |         temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
34 | self.temp_pdf_path = temp_pdf.name
35 | temp_pdf.close()
36 |
37 | # Convert XLSX to PDF
38 | try:
39 | self.convert_xlsx_to_pdf(filepath)
40 | except Exception as e:
41 | raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}")
42 |
43 | # Initialize the PDF provider with the temp pdf path
44 | super().__init__(self.temp_pdf_path, config)
45 |
46 | def __del__(self):
47 | if os.path.exists(self.temp_pdf_path):
48 | os.remove(self.temp_pdf_path)
49 |
50 | def convert_xlsx_to_pdf(self, filepath: str):
51 | from weasyprint import CSS, HTML
52 | from openpyxl import load_workbook
53 |
54 | html = ""
55 | workbook = load_workbook(filepath)
56 | if workbook is not None:
57 | for sheet_name in workbook.sheetnames:
58 | sheet = workbook[sheet_name]
59 | html += f'<div><h1>{sheet_name}</h1>' + self._excel_to_html_table(sheet) + '</div>'
60 | else:
61 | raise ValueError("Invalid XLSX file")
62 |
63 | # We convert the HTML into a PDF
64 | HTML(string=html).write_pdf(
65 | self.temp_pdf_path,
66 | stylesheets=[CSS(string=css), self.get_font_css()]
67 | )
68 |
69 | @staticmethod
70 | def _get_merged_cell_ranges(sheet):
71 | merged_info = {}
72 | for merged_range in sheet.merged_cells.ranges:
73 | min_col, min_row, max_col, max_row = merged_range.bounds
74 | merged_info[(min_row, min_col)] = {
75 | 'rowspan': max_row - min_row + 1,
76 | 'colspan': max_col - min_col + 1,
77 | 'range': merged_range
78 | }
79 | return merged_info
80 |
81 | def _excel_to_html_table(self, sheet):
82 | merged_cells = self._get_merged_cell_ranges(sheet)
83 |
84 |         html = '<table>'
85 |
86 | # Track cells we should skip due to being part of a merge range
87 | skip_cells = set()
88 |
89 | for row_idx, row in enumerate(sheet.rows, 1):
90 | html += '<tr>'
91 | for col_idx, cell in enumerate(row, 1):
92 | if (row_idx, col_idx) in skip_cells:
93 | continue
94 |
95 | # Check if this cell is the start of a merged range
96 | merge_info = merged_cells.get((row_idx, col_idx))
97 | if merge_info:
98 | # Add cells to skip
99 | for r in range(row_idx, row_idx + merge_info['rowspan']):
100 | for c in range(col_idx, col_idx + merge_info['colspan']):
101 | if (r, c) != (row_idx, col_idx):
102 | skip_cells.add((r, c))
103 |
104 | # Add merged cell with rowspan/colspan
105 | value = cell.value if cell.value is not None else ''
106 | html += f'<td rowspan="{merge_info["rowspan"]}" colspan="{merge_info["colspan"]}">{value}'
107 | else:
108 | # Regular cell
109 | value = cell.value if cell.value is not None else ''
110 | html += f'<td>{value}'
111 |
112 | html += '</td>'
113 | html += '</tr>'
114 | html += '</table>'
115 | return html
116 |
```
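
The merged-cell bookkeeping is the subtle part, and it can be checked directly against an in-memory workbook:

```python
from openpyxl import Workbook

from marker.providers.spreadsheet import SpreadSheetProvider

wb = Workbook()
ws = wb.active
ws["A1"] = "Header"
ws.merge_cells("A1:B1")  # one cell spanning two columns

print(SpreadSheetProvider._get_merged_cell_ranges(ws))
# {(1, 1): {'rowspan': 1, 'colspan': 2, 'range': ...}}
```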
--------------------------------------------------------------------------------
/marker/processors/ignoretext.py:
--------------------------------------------------------------------------------
```python
1 | import re
2 | from collections import Counter
3 | from itertools import groupby
4 | from typing import Annotated, List
5 |
6 | from rapidfuzz import fuzz
7 |
8 | from marker.processors import BaseProcessor
9 | from marker.schema import BlockTypes
10 | from marker.schema.blocks import Block
11 | from marker.schema.document import Document
12 |
13 |
14 | class IgnoreTextProcessor(BaseProcessor):
15 | """
16 | A processor for identifying and ignoring common text blocks in a document.
17 | These blocks often represent repetitive or non-essential elements, such as headers, footers, or page numbers.
18 | """
19 | block_types = (
20 | BlockTypes.Text, BlockTypes.SectionHeader,
21 | BlockTypes.TextInlineMath
22 | )
23 | common_element_threshold: Annotated[
24 | float,
25 | "The minimum ratio of pages a text block must appear on to be considered a common element.",
26 | "Blocks that meet or exceed this threshold are marked as common elements.",
27 | ] = 0.2
28 | common_element_min_blocks: Annotated[
29 | int,
30 |         "A text block must occur more than this many times within a document to be considered a common element.",
31 | "This ensures that rare blocks are not mistakenly flagged.",
32 | ] = 3
33 | max_streak: Annotated[
34 | int,
35 | "The maximum number of consecutive occurrences of a text block allowed before it is classified as a common element.",
36 | "Helps to identify patterns like repeated headers or footers.",
37 | ] = 3
38 | text_match_threshold: Annotated[
39 | int,
40 | "The minimum fuzzy match score (0-100) required to classify a text block as similar to a common element.",
41 | "Higher values enforce stricter matching.",
42 | ] = 90
43 |
44 | def __call__(self, document: Document):
45 | first_blocks = []
46 | last_blocks = []
47 | for page in document.pages:
48 | initial_block = None
49 | last_block = None
50 | for block in page.contained_blocks(document, self.block_types):
51 | if block.structure is not None:
52 | if initial_block is None:
53 | initial_block = block
54 |
55 | last_block = block
56 |
57 | if initial_block is not None:
58 | first_blocks.append(initial_block)
59 | if last_block is not None:
60 | last_blocks.append(last_block)
61 |
62 | self.filter_common_elements(document, first_blocks)
63 | self.filter_common_elements(document, last_blocks)
64 |
65 | @staticmethod
66 | def clean_text(text):
67 | text = text.replace("\n", "").strip()
68 | text = re.sub(r"^\d+\s*", "", text) # remove numbers at the start of the line
69 | text = re.sub(r"\s*\d+$", "", text) # remove numbers at the end of the line
70 | return text
71 |
72 | def filter_common_elements(self, document, blocks: List[Block]):
73 | # We can't filter if we don't have enough pages to find common elements
74 | if len(blocks) < self.common_element_min_blocks:
75 | return
76 |
77 | text = [self.clean_text(b.raw_text(document)) for b in blocks]
78 |
79 | streaks = {}
80 | for key, group in groupby(text):
81 | streaks[key] = max(streaks.get(key, 0), len(list(group)))
82 |
83 | counter = Counter(text)
84 | common = [
85 | k for k, v in counter.items()
86 | if (v >= len(blocks) * self.common_element_threshold or streaks[k] >= self.max_streak)
87 | and v > self.common_element_min_blocks
88 | ]
89 | if len(common) == 0:
90 | return
91 |
92 | for t, b in zip(text, blocks):
93 | # Check against all common elements
94 | if any(fuzz.ratio(t, common_element) > self.text_match_threshold for common_element in common):
95 | b.ignore_for_output = True
96 |
```
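
`clean_text` strips page numbers from either end before the fuzzy comparison, so headers that differ only by page number still match:

```python
from marker.processors.ignoretext import IgnoreTextProcessor

print(IgnoreTextProcessor.clean_text("3  Running Head"))   # "Running Head"
print(IgnoreTextProcessor.clean_text("Running Head  14"))  # "Running Head"
```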
--------------------------------------------------------------------------------
/marker/processors/llm/llm_complex.py:
--------------------------------------------------------------------------------
```python
1 | from typing import List
2 |
3 | import markdown2
4 | from pydantic import BaseModel
5 |
6 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor
7 |
8 | from marker.schema import BlockTypes
9 | from marker.schema.document import Document
10 |
11 |
12 | class LLMComplexRegionProcessor(BaseLLMSimpleBlockProcessor):
13 | block_types = (BlockTypes.ComplexRegion,)
14 | complex_region_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
15 | You will receive an image of a text block and the text that can be extracted from the image.
16 | Your task is to generate markdown to properly represent the content of the image. Do not omit any text present in the image - make sure everything is included in the markdown representation. The markdown representation should be as faithful to the original image as possible.
17 |
18 | Formatting should be in markdown, with the following rules:
19 | - * for italics, ** for bold, and ` for inline code.
20 | - Use <sup>...</sup> for superscripts.
21 | - Headers should be formatted with #, with one # for the largest header, and up to 6 for the smallest.
22 | - Lists should be formatted with either - or 1. for unordered and ordered lists, respectively.
23 | - Links should be formatted with [text](url).
24 | - Use ``` for code blocks.
25 | - Inline math should be formatted with <math>math expression</math>.
26 | - Display math should be formatted with <math display="block">math expression</math>.
27 | - Values and labels should be extracted from forms, and put into markdown tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables.
28 | - Tables should be formatted with markdown tables, with the headers bolded.
29 |
30 | **Instructions:**
31 | 1. Carefully examine the provided block image.
32 | 2. Analyze the existing text representation.
33 | 3. Generate the markdown representation of the content in the image.
34 | **Example:**
35 | Input:
36 | ```text
37 | Table 1: Car Sales
38 | ```
39 | Output:
40 | ```markdown
41 | ## Table 1: Car Sales
42 |
43 | | Car | Sales |
44 | | --- | --- |
45 | | Honda | 100 |
46 | | Toyota | 200 |
47 | ```
48 | **Input:**
49 | ```text
50 | {extracted_text}
51 | ```
52 | """
53 |
54 | def block_prompts(self, document: Document) -> List[PromptData]:
55 | prompt_data = []
56 | for block in self.inference_blocks(document):
57 | text = block["block"].raw_text(document)
58 | prompt = self.complex_region_prompt.replace("{extracted_text}", text)
59 | image = self.extract_image(document, block["block"])
60 | prompt_data.append({
61 | "prompt": prompt,
62 | "image": image,
63 | "block": block["block"],
64 | "schema": ComplexSchema,
65 | "page": block["page"]
66 | })
67 | return prompt_data
68 |
69 | def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
70 | block = prompt_data["block"]
71 | text = block.raw_text(document)
72 |
73 | if not response or "corrected_markdown" not in response:
74 | block.update_metadata(llm_error_count=1)
75 | return
76 |
77 | corrected_markdown = response["corrected_markdown"]
78 |
79 |         # The LLM says the original content is okay; keep it as-is
80 | if "no corrections" in corrected_markdown.lower():
81 | return
82 |
83 | # Potentially a partial response
84 | if len(corrected_markdown) < len(text) * .5:
85 | block.update_metadata(llm_error_count=1)
86 | return
87 |
88 | # Convert LLM markdown to html
89 | corrected_markdown = corrected_markdown.strip().lstrip("```markdown").rstrip("```").strip()
90 | block.html = markdown2.markdown(corrected_markdown, extras=["tables"])
91 |
92 | class ComplexSchema(BaseModel):
93 | corrected_markdown: str
```
--------------------------------------------------------------------------------
/tests/converters/test_pdf_converter.py:
--------------------------------------------------------------------------------
```python
1 | import io
2 |
3 | import pytest
4 | from marker.converters.pdf import PdfConverter
5 | from marker.renderers.markdown import MarkdownOutput
6 |
7 |
8 | @pytest.mark.output_format("markdown")
9 | @pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True})
10 | def test_pdf_converter(pdf_converter: PdfConverter, temp_doc):
11 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
12 | markdown = markdown_output.markdown
13 |
14 | # Basic assertions
15 | assert len(markdown) > 0
16 | assert "# Subspace Adversarial Training" in markdown
17 |
18 | # Some assertions for line joining across pages
19 | assert (
20 | "AT solutions. However, these methods highly rely on specifically" in markdown
21 | ) # pgs: 1-2
22 | assert (
23 | "(with adversarial perturbations), which harms natural accuracy, " in markdown
24 | ) # pgs: 3-4
25 |
26 | # Some assertions for line joining across columns
27 | assert "remain similar across a wide range of choices." in markdown # pg: 2
28 | assert "a new scheme for designing more robust and efficient" in markdown # pg: 8
29 |
30 |
31 | @pytest.mark.filename("manual.epub")
32 | @pytest.mark.config({"page_range": [0]})
33 | def test_epub_converter(pdf_converter: PdfConverter, temp_doc):
34 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
35 | markdown = markdown_output.markdown
36 |
37 | # Basic assertions
38 | assert "Simple Sabotage Field Manual" in markdown
39 |
40 |
41 | @pytest.mark.filename("single_sheet.xlsx")
42 | @pytest.mark.config({"page_range": [0]})
43 | def test_xlsx_converter(pdf_converter: PdfConverter, temp_doc):
44 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
45 | markdown = markdown_output.markdown
46 |
47 | # Basic assertions
48 | assert "four" in markdown
49 |
50 |
51 | @pytest.mark.filename("china.html")
52 | @pytest.mark.config({"page_range": [10]})
53 | def test_html_converter(pdf_converter: PdfConverter, temp_doc):
54 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
55 | markdown = markdown_output.markdown
56 |
57 | # Basic assertions
58 | assert "Republic of China" in markdown
59 |
60 |
61 | @pytest.mark.filename("gatsby.docx")
62 | @pytest.mark.config({"page_range": [0]})
63 | def test_docx_converter(pdf_converter: PdfConverter, temp_doc):
64 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
65 | markdown = markdown_output.markdown
66 |
67 | # Basic assertions
68 | assert "The Decline of the American Dream in the 1920s" in markdown
69 |
70 |
71 | @pytest.mark.filename("lambda.pptx")
72 | @pytest.mark.config({"page_range": [0]})
73 | def test_pptx_converter(pdf_converter: PdfConverter, temp_doc):
74 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
75 | markdown = markdown_output.markdown
76 |
77 | # Basic assertions
78 | assert "Adam Doupé" in markdown
79 |
80 |
81 | @pytest.mark.output_format("markdown")
82 | @pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True})
83 | def test_pdf_converter_bytes(pdf_converter: PdfConverter, temp_doc):
84 | with open(temp_doc.name, "rb") as f:
85 | data = f.read()
86 |
87 | input_bytes = io.BytesIO(data)
88 | markdown_output: MarkdownOutput = pdf_converter(input_bytes)
89 | markdown = markdown_output.markdown
90 |
91 | # Basic assertions
92 | assert len(markdown) > 0
93 | assert "# Subspace Adversarial Training" in markdown
94 |
95 | # Some assertions for line joining across pages
96 | assert (
97 | "AT solutions. However, these methods highly rely on specifically" in markdown
98 | ) # pgs: 1-2
99 | assert (
100 | "(with adversarial perturbations), which harms natural accuracy, " in markdown
101 | ) # pgs: 3-4
102 |
103 | # Some assertions for line joining across columns
104 | assert "remain similar across a wide range of choices." in markdown # pg: 2
105 | assert "a new scheme for designing more robust and efficient" in markdown # pg: 8
106 |
```
--------------------------------------------------------------------------------
/benchmarks/table/table.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS
3 |
4 | from pathlib import Path
5 | from itertools import repeat
6 | from typing import List
7 |
8 | import time
9 | import datasets
10 | from tqdm import tqdm
11 | import click
12 | from tabulate import tabulate
13 | import json
14 | from concurrent.futures import ProcessPoolExecutor
15 |
16 | from marker.settings import settings
17 | from benchmarks.table.inference import inference_tables
18 |
19 | from benchmarks.table.scoring import wrap_table_html, similarity_eval_html
20 |
21 | def update_teds_score(result, prefix: str = "marker"):
22 | prediction, ground_truth = result[f'{prefix}_table'], result['gt_table']
23 | prediction, ground_truth = wrap_table_html(prediction), wrap_table_html(ground_truth)
24 | score = similarity_eval_html(prediction, ground_truth)
25 | result.update({f'{prefix}_score':score})
26 | return result
27 |
28 |
29 | @click.command(help="Benchmark Table to HTML Conversion")
30 | @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "table"), help="Output path for results.")
31 | @click.option("--dataset", type=str, default="datalab-to/fintabnet_bench_marker", help="Dataset to use")
32 | @click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
33 | @click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
34 | @click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.")
35 | @click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.")
36 | @click.option("--use_gemini", is_flag=True, help="Evaluate Gemini for table recognition.")
37 | def main(
38 | result_path: str,
39 | dataset: str,
40 | max_rows: int,
41 | max_workers: int,
42 | use_llm: bool,
43 | table_rec_batch_size: int | None,
44 | use_gemini: bool = False
45 | ):
46 | start = time.time()
47 |
48 |
49 | dataset = datasets.load_dataset(dataset, split='train')
50 | dataset = dataset.shuffle(seed=0)
51 |
52 | results, total_unaligned = inference_tables(dataset, use_llm, table_rec_batch_size, max_rows, use_gemini)
53 |
54 | print(f"Total time: {time.time() - start}.")
55 | print(f"Could not align {total_unaligned} tables from fintabnet.")
56 |
57 | with ProcessPoolExecutor(max_workers=max_workers) as executor:
58 | marker_results = list(
59 | tqdm(
60 | executor.map(update_teds_score, results), desc='Computing alignment scores', total=len(results)
61 | )
62 | )
63 |
64 | avg_score = sum([r["marker_score"] for r in marker_results]) / len(marker_results)
65 | headers = ["Avg score", "Total tables"]
66 | data = [f"{avg_score:.3f}", len(marker_results)]
67 | gemini_results = None
68 | if use_gemini:
69 | with ProcessPoolExecutor(max_workers=max_workers) as executor:
70 | gemini_results = list(
71 | tqdm(
72 | executor.map(update_teds_score, results, repeat("gemini")), desc='Computing Gemini scores',
73 | total=len(results)
74 | )
75 | )
76 | avg_gemini_score = sum([r["gemini_score"] for r in gemini_results]) / len(gemini_results)
77 | headers.append("Avg Gemini score")
78 | data.append(f"{avg_gemini_score:.3f}")
79 |
80 | table = tabulate([data], headers=headers, tablefmt="github")
81 | print(table)
82 | print("Avg score computed by comparing marker predicted HTML with original HTML")
83 |
84 | results = {
85 | "marker": marker_results,
86 | "gemini": gemini_results
87 | }
88 |
89 | out_path = Path(result_path)
90 | out_path.mkdir(parents=True, exist_ok=True)
91 | with open(out_path / "table.json", "w+") as f:
92 | json.dump(results, f, indent=2)
93 |
94 | print(f"Results saved to {out_path}.")
95 |
96 | if __name__ == '__main__':
97 | main()
```
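
A minimal usage sketch for the scorer above (not part of the repository), assuming the `benchmarks` package is importable from the repo root and the scoring dependencies (`apted`, `distance`, `lxml`) are installed; the table HTML is illustrative:

```python
# Hypothetical single-row example: score one predicted table against its ground truth.
from benchmarks.table.table import update_teds_score

row = {
    "marker_table": "<table><tr><td>Revenue</td><td>10</td></tr></table>",
    "gt_table": "<table><tr><td>Revenue</td><td>10</td></tr></table>",
}
print(update_teds_score(row)["marker_score"])  # 1.0 for identical tables
```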
--------------------------------------------------------------------------------
/benchmarks/table/scoring.py:
--------------------------------------------------------------------------------
```python
1 | """"
2 | TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
3 | """
4 |
5 | import distance
6 | from apted import APTED, Config
7 | from apted.helpers import Tree
8 | from lxml import html
9 | from collections import deque
10 |
11 | def wrap_table_html(table_html:str)->str:
12 | return f'<html><body>{table_html}</body></html>'
13 |
14 | class TableTree(Tree):
15 | def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
16 | self.tag = tag
17 | self.colspan = colspan
18 | self.rowspan = rowspan
19 | self.content = content
20 |
21 | # Sets self.name and self.children
22 | super().__init__(tag, *children)
23 |
24 | def bracket(self):
25 | """Show tree using brackets notation"""
26 | if self.tag == 'td':
27 | result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
28 | (self.tag, self.colspan, self.rowspan, self.content)
29 | else:
30 | result = '"tag": %s' % self.tag
31 | for child in self.children:
32 | result += child.bracket()
33 | return "{{{}}}".format(result)
34 |
35 | class CustomConfig(Config):
36 | @staticmethod
37 | def maximum(*sequences):
38 | return max(map(len, sequences))
39 |
40 | def normalized_distance(self, *sequences):
41 | return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
42 |
43 | def rename(self, node1, node2):
44 | if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
45 | return 1.
46 | if node1.tag == 'td':
47 | if node1.content or node2.content:
48 | return self.normalized_distance(node1.content, node2.content)
49 | return 0.
50 |
51 | def tokenize(node):
52 | """
53 | Tokenizes table cells
54 | """
55 | global __tokens__
56 | __tokens__.append('<%s>' % node.tag)
57 | if node.text is not None:
58 | __tokens__ += list(node.text)
59 | for n in node.getchildren():
60 | tokenize(n)
61 | if node.tag != 'unk':
62 | __tokens__.append('</%s>' % node.tag)
63 | if node.tag != 'td' and node.tail is not None:
64 | __tokens__ += list(node.tail)
65 |
66 | def tree_convert_html(node, convert_cell=False, parent=None):
67 | """
68 | Converts HTML tree to the format required by apted
69 | """
70 | global __tokens__
71 | if node.tag == 'td':
72 | if convert_cell:
73 | __tokens__ = []
74 | tokenize(node)
75 | cell = __tokens__[1:-1].copy()
76 | else:
77 | cell = []
78 | new_node = TableTree(node.tag,
79 | int(node.attrib.get('colspan', '1')),
80 | int(node.attrib.get('rowspan', '1')),
81 | cell, *deque())
82 | else:
83 | new_node = TableTree(node.tag, None, None, None, *deque())
84 | if parent is not None:
85 | parent.children.append(new_node)
86 | if node.tag != 'td':
87 | for n in node.getchildren():
88 | tree_convert_html(n, convert_cell, new_node)
89 | if parent is None:
90 | return new_node
91 |
92 | def similarity_eval_html(pred, true, structure_only=False):
93 | """
94 |     Computes the TEDS score between the prediction and the ground truth of a given sample
95 | """
96 | pred, true = html.fromstring(pred), html.fromstring(true)
97 | if pred.xpath('body/table') and true.xpath('body/table'):
98 | pred = pred.xpath('body/table')[0]
99 | true = true.xpath('body/table')[0]
100 | n_nodes_pred = len(pred.xpath(".//*"))
101 | n_nodes_true = len(true.xpath(".//*"))
102 | tree_pred = tree_convert_html(pred, convert_cell=not structure_only)
103 | tree_true = tree_convert_html(true, convert_cell=not structure_only)
104 | n_nodes = max(n_nodes_pred, n_nodes_true)
105 |         edit_distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()  # avoid shadowing the distance module
106 |         return 1.0 - (float(edit_distance) / n_nodes)
107 | else:
108 | return 0.0
109 |
110 |
```
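
A small sketch of the TEDS scorer in isolation (assumes `apted`, `distance`, and `lxml` are installed). The two tables below differ in one cell's text, so the content-aware score falls below 1.0 while the structure-only score stays at 1.0:

```python
from benchmarks.table.scoring import wrap_table_html, similarity_eval_html

pred = wrap_table_html("<table><tr><td>a</td><td>b</td></tr></table>")
true = wrap_table_html("<table><tr><td>a</td><td>c</td></tr></table>")

print(similarity_eval_html(pred, true))                       # < 1.0: one cell's text differs
print(similarity_eval_html(pred, true, structure_only=True))  # 1.0: structures match
```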
--------------------------------------------------------------------------------
/marker/schema/text/span.py:
--------------------------------------------------------------------------------
```python
1 | import html
2 | import re
3 | from typing import List, Literal, Optional
4 |
5 | from marker.schema import BlockTypes
6 | from marker.schema.blocks import Block
7 | from marker.util import unwrap_math
8 |
9 |
10 | def cleanup_text(full_text):
11 | full_text = re.sub(r"(\n\s){3,}", "\n\n", full_text)
12 | full_text = full_text.replace("\xa0", " ") # Replace non-breaking spaces
13 | return full_text
14 |
15 |
16 | class Span(Block):
17 | block_type: BlockTypes = BlockTypes.Span
18 | block_description: str = "A span of text inside a line."
19 |
20 | text: str
21 | font: str
22 | font_weight: float
23 | font_size: float
24 | minimum_position: int
25 | maximum_position: int
26 | formats: List[
27 | Literal[
28 | "plain",
29 | "math",
30 | "chemical",
31 | "bold",
32 | "italic",
33 | "highlight",
34 | "subscript",
35 | "superscript",
36 | "small",
37 | "code",
38 | "underline",
39 | ]
40 | ]
41 | has_superscript: bool = False
42 | has_subscript: bool = False
43 | url: Optional[str] = None
44 | html: Optional[str] = None
45 |
46 | @property
47 | def bold(self):
48 | return "bold" in self.formats
49 |
50 | @property
51 | def italic(self):
52 | return "italic" in self.formats
53 |
54 | @property
55 | def math(self):
56 | return "math" in self.formats
57 |
58 | @property
59 | def highlight(self):
60 | return "highlight" in self.formats
61 |
62 | @property
63 | def superscript(self):
64 | return "superscript" in self.formats
65 |
66 | @property
67 | def subscript(self):
68 | return "subscript" in self.formats
69 |
70 | @property
71 | def small(self):
72 | return "small" in self.formats
73 |
74 | @property
75 | def code(self):
76 | return "code" in self.formats
77 |
78 | @property
79 | def underline(self):
80 | return "underline" in self.formats
81 |
82 | def assemble_html(self, document, child_blocks, parent_structure, block_config):
83 | if self.ignore_for_output:
84 | return ""
85 |
86 | if self.html:
87 | return self.html
88 |
89 | text = self.text
90 |
91 | # Remove trailing newlines
92 | replaced_newline = False
93 | while len(text) > 0 and text[-1] in ["\n", "\r"]:
94 | text = text[:-1]
95 | replaced_newline = True
96 |
97 | # Remove leading newlines
98 | while len(text) > 0 and text[0] in ["\n", "\r"]:
99 | text = text[1:]
100 |
101 | if replaced_newline and not text.endswith("-"):
102 | text += " "
103 |
104 | text = text.replace(
105 | "-\n", ""
106 | ) # Remove hyphenated line breaks from the middle of the span
107 | text = html.escape(text)
108 | text = cleanup_text(text)
109 |
110 | if self.has_superscript:
111 | text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", text)
112 |
113 | # Handle full block superscript
114 | if "<sup>" not in text:
115 | text = f"<sup>{text}</sup>"
116 |
117 | if self.url:
118 | text = f"<a href='{self.url}'>{text}</a>"
119 |
120 | # TODO Support multiple formats
121 | if self.italic:
122 | text = f"<i>{text}</i>"
123 | elif self.bold:
124 | text = f"<b>{text}</b>"
125 | elif self.math:
126 | block_envs = ["split", "align", "gather", "multline"]
127 | if any(f"\\begin{{{env}}}" in text for env in block_envs):
128 | display_mode = "block"
129 | else:
130 | display_mode = "inline"
131 | text = f"<math display='{display_mode}'>{text}</math>"
132 | elif self.highlight:
133 | text = f"<mark>{text}</mark>"
134 | elif self.subscript:
135 | text = f"<sub>{text}</sub>"
136 | elif self.superscript:
137 | text = f"<sup>{text}</sup>"
138 | elif self.underline:
139 | text = f"<u>{text}</u>"
140 | elif self.small:
141 | text = f"<small>{text}</small>"
142 | elif self.code:
143 | text = f"<code>{text}</code>"
144 |
145 | text = unwrap_math(text)
146 | return text
147 |
```
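
The format cascade at the end of `assemble_html` applies only the first matching format (hence the TODO). A standalone sketch of that precedence, using a hypothetical helper rather than the Block machinery (math is omitted here since it has its own display-mode handling):

```python
# Hypothetical helper mirroring the elif chain in Span.assemble_html.
FORMAT_TAGS = [
    ("italic", "i"), ("bold", "b"), ("highlight", "mark"), ("subscript", "sub"),
    ("superscript", "sup"), ("underline", "u"), ("small", "small"), ("code", "code"),
]

def wrap_first_format(text: str, formats: set) -> str:
    for name, tag in FORMAT_TAGS:
        if name in formats:
            return f"<{tag}>{text}</{tag}>"
    return text

print(wrap_first_format("x", {"bold", "underline"}))  # <b>x</b>: only the first match wins
```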
--------------------------------------------------------------------------------
/marker/processors/llm/llm_handwriting.py:
--------------------------------------------------------------------------------
```python
1 | import markdown2
2 | from pydantic import BaseModel
3 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData
4 |
5 | from marker.schema import BlockTypes
6 | from marker.schema.document import Document
7 |
8 | from typing import Annotated, List
9 |
10 |
11 | class LLMHandwritingProcessor(BaseLLMSimpleBlockProcessor):
12 | block_types = (BlockTypes.Handwriting, BlockTypes.Text)
13 | handwriting_generation_prompt: Annotated[
14 | str,
15 | "The prompt to use for OCRing handwriting.",
16 | "Default is a string containing the Gemini prompt."
17 | ] = """You are an expert editor specializing in accurately reproducing text from images.
18 | You will receive an image of a text block. Your task is to generate markdown to properly represent the content of the image. Do not omit any text present in the image - make sure everything is included in the markdown representation. The markdown representation should be as faithful to the original image as possible.
19 |
20 | Formatting should be in markdown, with the following rules:
21 | - * for italics, ** for bold, and ` for inline code.
22 | - Headers should be formatted with #, with one # for the largest header, and up to 6 for the smallest.
23 | - Lists should be formatted with either - or 1. for unordered and ordered lists, respectively.
24 | - Links should be formatted with [text](url).
25 | - Use ``` for code blocks.
26 | - Inline math should be formatted with <math>math expression</math>.
27 | - Display math should be formatted with <math display="block">math expression</math>.
28 | - Values and labels should be extracted from forms, and put into markdown tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables.
29 | - Tables should be formatted with markdown tables, with the headers bolded.
30 |
31 | **Instructions:**
32 | 1. Carefully examine the provided block image.
33 | 2. Output the markdown representing the content of the image.
34 | """
35 |
36 | def inference_blocks(self, document: Document) -> List[BlockData]:
37 | blocks = super().inference_blocks(document)
38 | out_blocks = []
39 | for block_data in blocks:
40 | raw_text = block_data["block"].raw_text(document)
41 | block = block_data["block"]
42 |
43 |             # Only process Text blocks with no detected lines and no text (likely unrecognized handwriting)
44 | if block.block_type == BlockTypes.Text:
45 | lines = block.contained_blocks(document, (BlockTypes.Line,))
46 | if len(lines) > 0 or len(raw_text.strip()) > 0:
47 | continue
48 | out_blocks.append(block_data)
49 | return out_blocks
50 |
51 |
52 | def block_prompts(self, document: Document) -> List[PromptData]:
53 | prompt_data = []
54 | for block_data in self.inference_blocks(document):
55 | block = block_data["block"]
56 | prompt = self.handwriting_generation_prompt
57 | image = self.extract_image(document, block)
58 |
59 | prompt_data.append({
60 | "prompt": prompt,
61 | "image": image,
62 | "block": block,
63 | "schema": HandwritingSchema,
64 | "page": block_data["page"]
65 | })
66 | return prompt_data
67 |
68 | def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
69 | block = prompt_data["block"]
70 | raw_text = block.raw_text(document)
71 |
72 | if not response or "markdown" not in response:
73 | block.update_metadata(llm_error_count=1)
74 | return
75 |
76 | markdown = response["markdown"]
77 | if len(markdown) < len(raw_text) * .5:
78 | block.update_metadata(llm_error_count=1)
79 | return
80 |
81 |         markdown = markdown.strip().removeprefix("```markdown").removesuffix("```").strip()  # removeprefix, not lstrip: lstrip strips character sets, not prefixes
82 | block.html = markdown2.markdown(markdown, extras=["tables"])
83 |
84 | class HandwritingSchema(BaseModel):
85 | markdown: str
86 |
```
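
The final rewrite step converts the model's markdown to HTML with `markdown2`; a quick sketch of that conversion, including the `tables` extra used above:

```python
import markdown2

md = "| Labels | Values |\n|--------|--------|\n| Name   | Ada    |"
print(markdown2.markdown(md, extras=["tables"]))  # renders a <table> element
```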
--------------------------------------------------------------------------------
/marker/services/azure_openai.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import time
3 | from typing import Annotated, List
4 |
5 | import PIL
6 | from marker.logger import get_logger
7 | from openai import AzureOpenAI, APITimeoutError, RateLimitError
8 | from PIL import Image
9 | from pydantic import BaseModel
10 |
11 | from marker.schema.blocks import Block
12 | from marker.services import BaseService
13 |
14 | logger = get_logger()
15 |
16 |
17 | class AzureOpenAIService(BaseService):
18 | azure_endpoint: Annotated[
19 | str, "The Azure OpenAI endpoint URL. No trailing slash."
20 | ] = None
21 | azure_api_key: Annotated[
22 | str, "The API key to use for the Azure OpenAI service."
23 | ] = None
24 | azure_api_version: Annotated[str, "The Azure OpenAI API version to use."] = None
25 | deployment_name: Annotated[
26 | str, "The deployment name for the Azure OpenAI model."
27 | ] = None
28 |
29 |     def process_images(self, images: List[PIL.Image.Image]) -> List[dict]:
30 | if isinstance(images, Image.Image):
31 | images = [images]
32 |
33 | return [
34 | {
35 | "type": "image_url",
36 | "image_url": {
37 | "url": "data:image/webp;base64,{}".format(self.img_to_base64(img)),
38 | },
39 | }
40 | for img in images
41 | ]
42 |
43 | def __call__(
44 | self,
45 | prompt: str,
46 | image: PIL.Image.Image | List[PIL.Image.Image] | None,
47 | block: Block | None,
48 | response_schema: type[BaseModel],
49 | max_retries: int | None = None,
50 | timeout: int | None = None,
51 | ):
52 | if max_retries is None:
53 | max_retries = self.max_retries
54 |
55 | if timeout is None:
56 | timeout = self.timeout
57 |
58 | client = self.get_client()
59 | image_data = self.format_image_for_llm(image)
60 |
61 | messages = [
62 | {
63 | "role": "user",
64 | "content": [
65 | *image_data,
66 | {"type": "text", "text": prompt},
67 | ],
68 | }
69 | ]
70 |
71 | total_tries = max_retries + 1
72 | for tries in range(1, total_tries + 1):
73 | try:
74 | response = client.beta.chat.completions.parse(
75 | extra_headers={
76 | "X-Title": "Marker",
77 | "HTTP-Referer": "https://github.com/datalab-to/marker",
78 | },
79 | model=self.deployment_name,
80 | messages=messages,
81 | timeout=timeout,
82 | response_format=response_schema,
83 | )
84 | response_text = response.choices[0].message.content
85 | total_tokens = response.usage.total_tokens
86 | if block:
87 | block.update_metadata(
88 | llm_tokens_used=total_tokens, llm_request_count=1
89 | )
90 | return json.loads(response_text)
91 | except (APITimeoutError, RateLimitError) as e:
92 |                 # Rate limit hit or request timed out
93 | if tries == total_tries:
94 | # Last attempt failed. Give up
95 | logger.error(
96 | f"Rate limit error: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})"
97 | )
98 | break
99 | else:
100 | wait_time = tries * self.retry_wait_time
101 | logger.warning(
102 | f"Rate limit error: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})"
103 | )
104 | time.sleep(wait_time)
105 | except Exception as e:
106 | logger.error(f"Azure OpenAI inference failed: {e}")
107 | break
108 |
109 | return {}
110 |
111 | def get_client(self) -> AzureOpenAI:
112 | return AzureOpenAI(
113 | api_version=self.azure_api_version,
114 | azure_endpoint=self.azure_endpoint,
115 | api_key=self.azure_api_key,
116 | )
117 |
```
--------------------------------------------------------------------------------
/benchmarks/overall/scorers/clean.py:
--------------------------------------------------------------------------------
```python
1 | import re
2 | import subprocess
3 | import tempfile
4 | from pathlib import Path
5 |
6 | import latex2mathml.converter
7 |
8 | class MarkdownCleaner:
9 | def __init__(self):
10 | pass
11 |
12 | def __call__(self, markdown):
13 | markdown = self.normalize_markdown(markdown) # Use pandoc to normalize
14 |
15 |         # Standardize math expressions with latex2mathml
16 | pattern = r'(?<!\\)\$(?:\$([^$]+)\$\$|\s*([^$\n]+?)\s*\$)'
17 | markdown = re.sub(pattern, self.standardize_math, markdown)
18 |
19 | # Replace image urls with a generic tag
20 | pattern = r'!\[(.*?)\]\((https?://[^\s\)]+)\)'
21 | markdown = re.sub(pattern, r'![link]', markdown)
22 |
23 | # Clean up stray html tags
24 | markdown = markdown.replace("<br>", "\n")
25 | markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown)
26 | markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown)
27 | markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown) # Remove span tags and keep content
28 |
29 | # Clean up markdown formatting
30 | markdown = re.sub(r"\s+", " ", markdown)
31 | markdown = re.sub(r"\n+", "\n", markdown)
32 | markdown = re.sub("\\.+", ".",
33 | markdown) # Replace repeated periods with a single period, like in table of contents
34 | markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header
35 | markdown = markdown.encode().decode('unicode-escape', errors="ignore") # Decode unicode characters properly
36 | return markdown.strip().lower()
37 |
38 | @staticmethod
39 | def normalize_markdown(md_text: str) -> str:
40 | with tempfile.TemporaryDirectory() as tmp_dir:
41 | dirpath = Path(tmp_dir)
42 | input_file = dirpath / 'input.md'
43 | input_file.write_text(md_text, encoding='utf-8')
44 |
45 | # Markdown to HTML
46 | html_file = dirpath / 'temp.html'
47 | subprocess.run(
48 | [
49 | 'pandoc',
50 | str(input_file),
51 | '-f', 'markdown+tex_math_dollars',
52 | '-t', 'html',
53 | '-o', str(html_file),
54 | '--quiet'
55 | ],
56 | check=True
57 | )
58 |
59 | # HTML to Markdown
60 | output_file = dirpath / 'output.md'
61 | subprocess.run(
62 | [
63 | 'pandoc',
64 | str(html_file),
65 | '-f', 'html',
66 | '-t', 'markdown+tex_math_dollars',
67 | '-o', str(output_file),
68 | '--quiet'
69 | ],
70 | check=True
71 | )
72 |
73 | # Read back the normalized Markdown
74 | normalized_md = output_file.read_text(encoding='utf-8')
75 |
76 | return normalized_md
77 |
78 | def standardize_math(self, match):
79 | try:
80 | delim = "$$" if match.group(0).startswith('$$') else "$"
81 | math_content = match.group(1) or match.group(2)
82 | if delim == "$$":
83 | math_content = latex2mathml.converter.convert(math_content)
84 | else:
85 | math_content = self.clean_latex(math_content)
86 | return f'{delim}{math_content}{delim}'
87 | except Exception as e:
88 | print(f"Failed to standardize math expression: {match.group(0)} with error: {e}")
89 | return match.group(0)
90 |
91 | @staticmethod
92 | def clean_latex(latex_str):
93 | latex_str = re.sub(r'\s+', ' ', latex_str.strip())
94 | for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']:
95 | latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str)
96 |
97 | replacements = {
98 | '\\times': '*',
99 | '\\cdot': '*',
100 | '\\div': '/',
101 | '\\le': '<=',
102 | '\\ge': '>=',
103 | '\\neq': '!=',
104 | '\\to': '\\rightarrow',
105 | }
106 |
107 | for old, new in replacements.items():
108 | latex_str = latex_str.replace(old, new)
109 |
110 | return latex_str
111 |
112 |
113 |
114 |
```
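
`clean_latex` is a staticmethod, so it can be exercised without pandoc (which `normalize_markdown` shells out to); the only import dependency is `latex2mathml`. A sketch:

```python
from benchmarks.overall.scorers.clean import MarkdownCleaner

# \mathbf{...} wrappers are stripped first, then operators are normalized.
print(MarkdownCleaner.clean_latex(r"\mathbf{x} \times \mathbf{y} \le z"))  # "x * y <= z"
```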
--------------------------------------------------------------------------------
/marker/utils/gpu.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 | import subprocess
3 | import torch
4 |
5 | from marker.logger import get_logger
6 | from marker.settings import settings
7 |
8 | logger = get_logger()
9 |
10 |
11 | class GPUManager:
12 | default_gpu_vram: int = 8
13 |
14 | def __init__(self, device_idx: int):
15 | self.device_idx = device_idx
16 | self.original_compute_mode = None
17 | self.mps_server_process = None
18 |
19 | def __enter__(self):
20 | if self.using_cuda():
21 | self.start_mps_server()
22 | return self
23 |
24 | def __exit__(self, exc_type, exc_val, exc_tb):
25 | if self.using_cuda():
26 | self.cleanup()
27 |
28 | @staticmethod
29 | def using_cuda():
30 | return "cuda" in settings.TORCH_DEVICE_MODEL
31 |
32 | def check_cuda_available(self) -> bool:
33 | if not torch.cuda.is_available():
34 | return False
35 | try:
36 | subprocess.run(["nvidia-smi", "--version"], capture_output=True, check=True)
37 | return True
38 | except (subprocess.CalledProcessError, FileNotFoundError):
39 | return False
40 |
41 | def get_gpu_vram(self):
42 | if not self.using_cuda():
43 | return self.default_gpu_vram
44 |
45 | try:
46 | result = subprocess.run(
47 | [
48 | "nvidia-smi",
49 | "--query-gpu=memory.total",
50 | "--format=csv,noheader,nounits",
51 | "-i",
52 | str(self.device_idx),
53 | ],
54 | capture_output=True,
55 | text=True,
56 | check=True,
57 | )
58 |
59 | vram_mb = int(result.stdout.strip())
60 | vram_gb = int(vram_mb / 1024)
61 | return vram_gb
62 |
63 | except (subprocess.CalledProcessError, ValueError, FileNotFoundError):
64 | return self.default_gpu_vram
65 |
66 | def start_mps_server(self) -> bool:
67 | if not self.check_cuda_available():
68 | return False
69 |
70 | try:
71 | # Set MPS environment with chunk-specific directories
72 | env = os.environ.copy()
73 | pipe_dir = f"/tmp/nvidia-mps-{self.device_idx}"
74 | log_dir = f"/tmp/nvidia-log-{self.device_idx}"
75 | env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir
76 | env["CUDA_MPS_LOG_DIRECTORY"] = log_dir
77 |
78 | # Create directories
79 | os.makedirs(pipe_dir, exist_ok=True)
80 | os.makedirs(log_dir, exist_ok=True)
81 |
82 | # Start MPS control daemon
83 | self.mps_server_process = subprocess.Popen(
84 | ["nvidia-cuda-mps-control", "-d"],
85 | env=env,
86 | stdout=subprocess.PIPE,
87 | stderr=subprocess.PIPE,
88 | )
89 |
90 | logger.info(f"Started NVIDIA MPS server for chunk {self.device_idx}")
91 | return True
92 | except (subprocess.CalledProcessError, FileNotFoundError) as e:
93 | logger.warning(
94 | f"Failed to start MPS server for chunk {self.device_idx}: {e}"
95 | )
96 | return False
97 |
98 | def stop_mps_server(self) -> None:
99 | try:
100 | # Stop MPS server
101 | env = os.environ.copy()
102 | env["CUDA_MPS_PIPE_DIRECTORY"] = f"/tmp/nvidia-mps-{self.device_idx}"
103 | env["CUDA_MPS_LOG_DIRECTORY"] = f"/tmp/nvidia-log-{self.device_idx}"
104 |
105 | subprocess.run(
106 | ["nvidia-cuda-mps-control"],
107 | input="quit\n",
108 | text=True,
109 | env=env,
110 | timeout=10,
111 | )
112 |
113 | if self.mps_server_process:
114 | self.mps_server_process.terminate()
115 | try:
116 | self.mps_server_process.wait(timeout=5)
117 | except subprocess.TimeoutExpired:
118 | self.mps_server_process.kill()
119 | self.mps_server_process = None
120 |
121 | logger.info(f"Stopped NVIDIA MPS server for chunk {self.device_idx}")
122 | except Exception as e:
123 | logger.warning(
124 | f"Failed to stop MPS server for chunk {self.device_idx}: {e}"
125 | )
126 |
127 | def cleanup(self) -> None:
128 | self.stop_mps_server()
129 |
```
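
A usage sketch for the context manager above; on a non-CUDA device it is effectively a no-op, and `get_gpu_vram` falls back to the 8 GB default:

```python
from marker.utils.gpu import GPUManager

# Starts an MPS server for device 0 on CUDA machines; cleans it up on exit.
with GPUManager(device_idx=0) as gpu:
    print(f"~{gpu.get_gpu_vram()} GB VRAM on device 0")  # 8 (default) without nvidia-smi
```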
--------------------------------------------------------------------------------
/marker/processors/sectionheader.py:
--------------------------------------------------------------------------------
```python
1 | import warnings
2 | from typing import Annotated, Dict, List
3 |
4 | import numpy as np
5 | from sklearn.cluster import KMeans
6 | from sklearn.exceptions import ConvergenceWarning
7 |
8 | from marker.processors import BaseProcessor
9 | from marker.schema import BlockTypes
10 | from marker.schema.document import Document
11 |
12 | # Ignore sklearn warning about not converging
13 | warnings.filterwarnings("ignore", category=ConvergenceWarning)
14 |
15 |
16 | class SectionHeaderProcessor(BaseProcessor):
17 | """
18 | A processor for recognizing section headers in the document.
19 | """
20 | block_types = (BlockTypes.SectionHeader, )
21 | level_count: Annotated[
22 | int,
23 | "The number of levels to use for headings.",
24 | ] = 4
25 | merge_threshold: Annotated[
26 | float,
27 | "The minimum gap between headings to consider them part of the same group.",
28 | ] = 0.25
29 | default_level: Annotated[
30 | int,
31 | "The default heading level to use if no heading level is detected.",
32 | ] = 2
33 | height_tolerance: Annotated[
34 | float,
35 | "The minimum height of a heading to consider it a heading.",
36 | ] = 0.99
37 |
38 | def __call__(self, document: Document):
39 | line_heights: Dict[int, float] = {}
40 | for page in document.pages:
41 | # Iterate children to grab all section headers
42 | for block in page.children:
43 | if block.block_type not in self.block_types:
44 | continue
45 | if block.structure is not None:
46 | line_heights[block.id] = block.line_height(document)
47 | else:
48 | line_heights[block.id] = 0
49 | block.ignore_for_output = True # Don't output an empty section header
50 |
51 | flat_line_heights = list(line_heights.values())
52 |         heading_ranges = self.bucket_headings(flat_line_heights, num_levels=self.level_count)
53 |
54 | for page in document.pages:
55 | # Iterate children to grab all section headers
56 | for block in page.children:
57 | if block.block_type not in self.block_types:
58 | continue
59 | block_height = line_heights.get(block.id, 0)
60 | if block_height > 0:
61 | for idx, (min_height, max_height) in enumerate(heading_ranges):
62 | if block_height >= min_height * self.height_tolerance:
63 | block.heading_level = idx + 1
64 | break
65 |
66 | if block.heading_level is None:
67 | block.heading_level = self.default_level
68 |
69 | def bucket_headings(self, line_heights: List[float], num_levels=4):
70 | if len(line_heights) <= self.level_count:
71 | return []
72 |
73 | data = np.asarray(line_heights).reshape(-1, 1)
74 | labels = KMeans(n_clusters=num_levels, random_state=0, n_init="auto").fit_predict(data)
75 | data_labels = np.concatenate([data, labels.reshape(-1, 1)], axis=1)
76 |         data_labels = data_labels[np.argsort(data_labels[:, 0])]  # np.sort(axis=0) would sort each column independently and break (height, label) pairs
77 |
78 | cluster_means = {int(label): float(np.mean(data_labels[data_labels[:, 1] == label, 0])) for label in np.unique(labels)}
79 | label_max = None
80 | label_min = None
81 | heading_ranges = []
82 | prev_cluster = None
83 | for row in data_labels:
84 | value, label = row
85 | value = float(value)
86 | label = int(label)
87 | if prev_cluster is not None and label != prev_cluster:
88 | prev_cluster_mean = cluster_means[prev_cluster]
89 | cluster_mean = cluster_means[label]
90 | if cluster_mean * self.merge_threshold < prev_cluster_mean:
91 | heading_ranges.append((label_min, label_max))
92 | label_min = None
93 | label_max = None
94 |
95 | label_min = value if label_min is None else min(label_min, value)
96 | label_max = value if label_max is None else max(label_max, value)
97 | prev_cluster = label
98 |
99 | if label_min is not None:
100 | heading_ranges.append((label_min, label_max))
101 |
102 | heading_ranges = sorted(heading_ranges, reverse=True)
103 |
104 | return heading_ranges
105 |
```
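
A toy sketch of the clustering idea behind `bucket_headings`: line heights are clustered with KMeans, and taller clusters map to more prominent heading levels (the values are illustrative):

```python
import numpy as np
from sklearn.cluster import KMeans

heights = np.array([24.0, 23.5, 16.0, 16.2, 11.1, 11.0, 10.9]).reshape(-1, 1)
labels = KMeans(n_clusters=3, random_state=0, n_init="auto").fit_predict(heights)
print(labels)  # e.g. [1 1 2 2 0 0 0]: each cluster becomes a heading-level bucket
```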
--------------------------------------------------------------------------------
/marker/config/printer.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Optional
2 |
3 | import click
4 |
5 | from marker.config.crawler import crawler
6 |
7 |
8 | class CustomClickPrinter(click.Command):
9 | def parse_args(self, ctx, args):
10 | display_help = "config" in args and "--help" in args
11 | if display_help:
12 | click.echo(
13 | "Here is a list of all the Builders, Processors, Converters, Providers and Renderers in Marker along with their attributes:"
14 | )
15 |
16 | # Keep track of shared attributes and their types
17 | shared_attrs = {}
18 |
19 | # First pass: identify shared attributes and verify compatibility
20 | for base_type, base_type_dict in crawler.class_config_map.items():
21 | for class_name, class_map in base_type_dict.items():
22 | for attr, (attr_type, formatted_type, default, metadata) in class_map[
23 | "config"
24 | ].items():
25 | if attr not in shared_attrs:
26 | shared_attrs[attr] = {
27 | "classes": [],
28 | "type": attr_type,
29 | "is_flag": attr_type in [bool, Optional[bool]]
30 | and not default,
31 | "metadata": metadata,
32 | "default": default,
33 | }
34 | shared_attrs[attr]["classes"].append(class_name)
35 |
36 | # These are the types of attrs that can be set from the command line
37 | attr_types = [
38 | str,
39 | int,
40 | float,
41 | bool,
42 | Optional[int],
43 | Optional[float],
44 | Optional[str],
45 | ]
46 |
47 | # Add shared attribute options first
48 | for attr, info in shared_attrs.items():
49 | if info["type"] in attr_types:
50 | ctx.command.params.append(
51 | click.Option(
52 | ["--" + attr],
53 | type=info["type"],
54 | help=" ".join(info["metadata"])
55 | + f" (Applies to: {', '.join(info['classes'])})",
56 | default=None, # This is important, or it sets all the default keys again in config
57 | is_flag=info["is_flag"],
58 | flag_value=True if info["is_flag"] else None,
59 | )
60 | )
61 |
62 | # Second pass: create class-specific options
63 | for base_type, base_type_dict in crawler.class_config_map.items():
64 | if display_help:
65 | click.echo(f"{base_type}s:")
66 | for class_name, class_map in base_type_dict.items():
67 | if display_help and class_map["config"]:
68 | click.echo(
69 | f"\n {class_name}: {class_map['class_type'].__doc__ or ''}"
70 | )
71 | click.echo(" " * 4 + "Attributes:")
72 | for attr, (attr_type, formatted_type, default, metadata) in class_map[
73 | "config"
74 | ].items():
75 | class_name_attr = class_name + "_" + attr
76 |
77 | if display_help:
78 | click.echo(" " * 8 + f"{attr} ({formatted_type}):")
79 | click.echo(
80 | "\n".join([f"{' ' * 12}" + desc for desc in metadata])
81 | )
82 |
83 | if attr_type in attr_types:
84 | is_flag = attr_type in [bool, Optional[bool]] and not default
85 |
86 | # Only add class-specific options
87 | ctx.command.params.append(
88 | click.Option(
89 | ["--" + class_name_attr, class_name_attr],
90 | type=attr_type,
91 | help=" ".join(metadata),
92 | is_flag=is_flag,
93 | default=None, # This is important, or it sets all the default keys again in config
94 | )
95 | )
96 |
97 | if display_help:
98 | ctx.exit()
99 |
100 | super().parse_args(ctx, args)
101 |
```
--------------------------------------------------------------------------------
/marker/processors/text.py:
--------------------------------------------------------------------------------
```python
1 | import math
2 | from typing import Annotated, List
3 |
4 | import regex
5 |
6 | from marker.processors import BaseProcessor
7 | from marker.schema import BlockTypes
8 | from marker.schema.document import Document
9 | from marker.schema.text.line import Line
10 |
11 |
12 | class TextProcessor(BaseProcessor):
13 | """
14 | A processor for merging text across pages and columns.
15 | """
16 |
17 | block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
18 | ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
19 | column_gap_ratio: Annotated[
20 | float,
21 | "The minimum ratio of the page width to the column gap to consider a column break.",
22 | ] = 0.02
23 |
24 | def __init__(self, config):
25 | super().__init__(config)
26 |
27 | def __call__(self, document: Document):
28 | for page in document.pages:
29 | for block in page.contained_blocks(document, self.block_types):
30 | if block.structure is None:
31 | continue
32 |
33 | if not len(block.structure) >= 2: # Skip single lines
34 | continue
35 |
36 | next_block = document.get_next_block(block, self.ignored_block_types)
37 | if next_block is None: # we've reached the end of the document
38 | continue
39 | if next_block.block_type not in self.block_types:
40 | continue # we found a non-text block
41 | if next_block.structure is None:
42 | continue # This is odd though, why do we have text blocks with no structure?
43 | if next_block.ignore_for_output:
44 | continue # skip ignored blocks
45 |
46 | column_gap = block.polygon.width * self.column_gap_ratio
47 |
48 | column_break, page_break = False, False
49 | next_block_starts_indented = True
50 | next_block_in_first_quadrant = False
51 | last_line_is_full_width = False
52 |                 last_line_is_hyphenated = False
53 |
54 | if next_block.page_id == block.page_id: # block on the same page
55 | # we check for a column break
56 | column_break = math.floor(next_block.polygon.y_start) <= math.ceil(
57 | block.polygon.y_start
58 | ) and next_block.polygon.x_start > (
59 | block.polygon.x_end + column_gap
60 | )
61 | else:
62 | page_break = True
63 | next_page = document.get_page(next_block.page_id)
64 | next_block_in_first_quadrant = (
65 | next_block.polygon.x_start < next_page.polygon.width // 2
66 | ) and (next_block.polygon.y_start < next_page.polygon.height // 2)
67 |
68 | if not (column_break or page_break):
69 | continue
70 |
71 | new_block_lines = next_block.structure_blocks(document)
72 |
73 | # we check for next_block indentation
74 | if len(new_block_lines):
75 | min_x = math.ceil(
76 | min([line.polygon.x_start for line in new_block_lines])
77 | )
78 | next_block_starts_indented = (
79 | new_block_lines[0].polygon.x_start > min_x
80 | )
81 |
82 | lines: List[Line] = [
83 | line
84 | for line in block.structure_blocks(document)
85 | if line.polygon.width > 1
86 | ]
87 | if len(lines):
88 | max_x = math.floor(max([line.polygon.x_end for line in lines]))
89 | last_line_is_full_width = lines[-1].polygon.x_end >= max_x
90 |
91 |                 last_line_is_hyphenated = regex.compile(
92 | r".*[\p{Ll}|\d][-—¬]\s?$", regex.DOTALL
93 | ).match(lines[-1].raw_text(document).strip())
94 |
95 | if (
96 |                     (last_line_is_full_width or last_line_is_hyphenated)
97 | and not next_block_starts_indented
98 | and ((next_block_in_first_quadrant and page_break) or column_break)
99 | ):
100 | block.has_continuation = True
101 |
```
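
The hyphenation check above relies on a `regex` pattern that matches lines ending in a hyphen (or similar break character) after a lowercase letter or digit; a quick sketch:

```python
import regex

hyphenated = regex.compile(r".*[\p{Ll}|\d][-—¬]\s?$", regex.DOTALL)
print(bool(hyphenated.match("continues in the next col-")))  # True
print(bool(hyphenated.match("a complete sentence.")))        # False
```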
--------------------------------------------------------------------------------
/benchmarks/overall/methods/__init__.py:
--------------------------------------------------------------------------------
```python
1 | import io
2 | import random
3 | import re
4 | from typing import Tuple
5 |
6 | import markdown2
7 | from PIL import Image
8 | from playwright.sync_api import sync_playwright
9 |
10 | from benchmarks.overall.methods.schema import BenchmarkResult
11 | from marker.renderers.markdown import MarkdownRenderer
12 |
13 |
14 | class BaseMethod:
15 | def __init__(self, **kwargs):
16 | for kwarg in kwargs:
17 | if hasattr(self, kwarg):
18 | setattr(self, kwarg, kwargs[kwarg])
19 |
20 | @staticmethod
21 | def convert_to_md(html: str):
22 | md = MarkdownRenderer()
23 | markdown = md.md_cls.convert(html)
24 | return markdown
25 |
26 | def __call__(self, sample) -> BenchmarkResult:
27 | raise NotImplementedError()
28 |
29 | def render(self, markdown: str):
30 | return self.html_to_image(self.convert_to_html(markdown))
31 |
32 | @staticmethod
33 | def convert_to_html(md: str):
34 | block_placeholders = []
35 | inline_placeholders = []
36 |
37 | # Add placeholders for the math
38 | def block_sub(match):
39 | content = match.group(1)
40 | placeholder = f"1BLOCKMATH{len(block_placeholders)}1"
41 | block_placeholders.append((placeholder, f"$${content}$$"))
42 | return placeholder
43 |
44 | def inline_sub(match):
45 | content = match.group(1)
46 | placeholder = f"1INLINEMATH{len(inline_placeholders)}1"
47 | inline_placeholders.append((placeholder, f"${content}$"))
48 | return placeholder
49 |
50 | md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL)
51 | md = re.sub(r'\$(.*?)\$', inline_sub, md)
52 |
53 | html = markdown2.markdown(md, extras=['tables'])
54 |
55 | # Replace placeholders
56 | for placeholder, math_str in block_placeholders:
57 | html = html.replace(placeholder, math_str)
58 | for placeholder, math_str in inline_placeholders:
59 | html = html.replace(placeholder, math_str)
60 |
61 | return html
62 |
63 | def html_to_image(self, html: str) -> Image.Image:
64 | with sync_playwright() as p:
65 | browser = p.chromium.launch()
66 | page = browser.new_page()
67 | html_str = f"""
68 | <!DOCTYPE html>
69 | <html>
70 | <head>
71 | <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
72 | <!-- The loading of KaTeX is deferred to speed up page rendering -->
73 | <script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
74 | <!-- To automatically render math in text elements, include the auto-render extension: -->
75 | <script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
76 | </head>
77 | <body>
78 | {html}
79 | <script>
80 | document.addEventListener("DOMContentLoaded", function() {{
81 | renderMathInElement(document.body, {{
82 | delimiters: [
83 | {{left: '$$', right: '$$', display: true}},
84 | {{left: '$', right: '$', display: false}}
85 | ],
86 | throwOnError : false
87 | }});
88 | }});
89 | </script>
90 | </body>
91 | </html>
92 | """.strip()
93 | page.set_viewport_size({"width": 1200, "height": 800})
94 | page.set_content(html_str)
95 | page.wait_for_load_state("domcontentloaded")
96 | page.wait_for_timeout(500) # Wait for KaTeX to render
97 | screenshot_bytes = page.screenshot(full_page=True)
98 | browser.close()
99 |
100 | return Image.open(io.BytesIO(screenshot_bytes))
```
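
`convert_to_html` protects TeX spans from `markdown2` by swapping them for placeholders and restoring them after conversion. A sketch (assumes the benchmark extras `markdown2` and `playwright` are installed, since importing the module pulls both in):

```python
from benchmarks.overall.methods import BaseMethod

html = BaseMethod.convert_to_html("Inline $x^2$ and display math:\n\n$$\\int_0^1 x\\,dx$$")
print(html)  # the $...$ and $$...$$ spans survive the markdown round-trip intact
```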
--------------------------------------------------------------------------------
/marker/processors/llm/llm_form.py:
--------------------------------------------------------------------------------
```python
1 | from typing import List
2 |
3 | from pydantic import BaseModel
4 |
5 | from marker.output import json_to_html
6 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData
7 |
8 | from marker.schema import BlockTypes
9 | from marker.schema.document import Document
10 |
11 |
12 | class LLMFormProcessor(BaseLLMSimpleBlockProcessor):
13 | block_types = (BlockTypes.Form,)
14 | form_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
15 | You will receive an image of a text block and an html representation of the form in the image.
16 | Your task is to correct any errors in the html representation, and format it properly.
17 | Values and labels should appear in html tables, with the labels on the left side, and values on the right. Other text in the form can appear between the tables. Only use the tags `table, p, span, i, b, th, td, tr, and div`. Do not omit any text from the form - make sure everything is included in the html representation. It should be as faithful to the original form as possible.
18 | **Instructions:**
19 | 1. Carefully examine the provided form block image.
20 | 2. Analyze the html representation of the form.
21 | 3. Compare the html representation to the image.
22 | 4. If the html representation is correct, or you cannot read the image properly, then write "No corrections needed."
23 | 5. If the html representation contains errors, generate the corrected html representation.
24 | 6. Output only either the corrected html representation or "No corrections needed."
25 | **Example:**
26 | Input:
27 | ```html
28 | <table>
29 | <tr>
30 | <td>Label 1</td>
31 | <td>Label 2</td>
32 | <td>Label 3</td>
33 | </tr>
34 | <tr>
35 | <td>Value 1</td>
36 | <td>Value 2</td>
37 | <td>Value 3</td>
38 | </tr>
39 | </table>
40 | ```
41 | Output:
42 | Comparison: The html representation has the labels in the first row and the values in the second row. It should be corrected to have the labels on the left side and the values on the right side.
43 | ```html
44 | <table>
45 | <tr>
46 | <td>Label 1</td>
47 | <td>Value 1</td>
48 | </tr>
49 | <tr>
50 | <td>Label 2</td>
51 | <td>Value 2</td>
52 | </tr>
53 | <tr>
54 | <td>Label 3</td>
55 | <td>Value 3</td>
56 | </tr>
57 | </table>
58 | ```
59 | **Input:**
60 | ```html
61 | {block_html}
62 | ```
63 | """
64 |
65 | def inference_blocks(self, document: Document) -> List[BlockData]:
66 | blocks = super().inference_blocks(document)
67 | out_blocks = []
68 | for block_data in blocks:
69 | block = block_data["block"]
70 | children = block.contained_blocks(document, (BlockTypes.TableCell,))
71 | if not children:
72 | continue
73 | out_blocks.append(block_data)
74 | return out_blocks
75 |
76 |
77 | def block_prompts(self, document: Document) -> List[PromptData]:
78 | prompt_data = []
79 | for block_data in self.inference_blocks(document):
80 | block = block_data["block"]
81 | block_html = json_to_html(block.render(document))
82 | prompt = self.form_rewriting_prompt.replace("{block_html}", block_html)
83 | image = self.extract_image(document, block)
84 | prompt_data.append({
85 | "prompt": prompt,
86 | "image": image,
87 | "block": block,
88 | "schema": FormSchema,
89 | "page": block_data["page"]
90 | })
91 | return prompt_data
92 |
93 |
94 | def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
95 | block = prompt_data["block"]
96 | block_html = json_to_html(block.render(document))
97 |
98 | if not response or "corrected_html" not in response:
99 | block.update_metadata(llm_error_count=1)
100 | return
101 |
102 | corrected_html = response["corrected_html"]
103 |
104 | # The original table is okay
105 | if "no corrections needed" in corrected_html.lower():
106 | return
107 |
108 | # Potentially a partial response
109 | if len(corrected_html) < len(block_html) * .33:
110 | block.update_metadata(llm_error_count=1)
111 | return
112 |
113 |         corrected_html = corrected_html.strip().removeprefix("```html").removesuffix("```").strip()  # removeprefix, not lstrip: lstrip strips character sets, not prefixes
114 | block.html = corrected_html
115 |
116 | class FormSchema(BaseModel):
117 | comparison: str
118 | corrected_html: str
```
--------------------------------------------------------------------------------
/marker/renderers/ocr_json.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated, List, Tuple
2 |
3 | from pydantic import BaseModel
4 |
5 | from marker.renderers import BaseRenderer
6 | from marker.schema import BlockTypes
7 | from marker.schema.document import Document
8 |
9 |
10 | class OCRJSONCharOutput(BaseModel):
11 | id: str
12 | block_type: str
13 | text: str
14 | polygon: List[List[float]]
15 | bbox: List[float]
16 |
17 |
18 | class OCRJSONLineOutput(BaseModel):
19 | id: str
20 | block_type: str
21 | html: str
22 | polygon: List[List[float]]
23 | bbox: List[float]
24 | children: List["OCRJSONCharOutput"] | None = None
25 |
26 |
27 | class OCRJSONPageOutput(BaseModel):
28 | id: str
29 | block_type: str
30 | polygon: List[List[float]]
31 | bbox: List[float]
32 | children: List[OCRJSONLineOutput] | None = None
33 |
34 |
35 | class OCRJSONOutput(BaseModel):
36 | children: List[OCRJSONPageOutput]
37 | block_type: str = str(BlockTypes.Document)
38 | metadata: dict | None = None
39 |
40 |
41 | class OCRJSONRenderer(BaseRenderer):
42 | """
43 | A renderer for OCR JSON output.
44 | """
45 |
46 | image_blocks: Annotated[
47 | Tuple[BlockTypes],
48 | "The list of block types to consider as images.",
49 | ] = (BlockTypes.Picture, BlockTypes.Figure)
50 | page_blocks: Annotated[
51 | Tuple[BlockTypes],
52 | "The list of block types to consider as pages.",
53 | ] = (BlockTypes.Page,)
54 |
55 | def extract_json(self, document: Document) -> List[OCRJSONPageOutput]:
56 | pages = []
57 | for page in document.pages:
58 | page_equations = [
59 | b for b in page.children if b.block_type == BlockTypes.Equation
60 | and not b.removed
61 | ]
62 | equation_lines = []
63 | for equation in page_equations:
64 | if not equation.structure:
65 | continue
66 |
67 | equation_lines += [
68 | line
69 | for line in equation.structure
70 | if line.block_type == BlockTypes.Line
71 | ]
72 |
73 | page_lines = [
74 | block
75 | for block in page.children
76 | if block.block_type == BlockTypes.Line
77 | and block.id not in equation_lines
78 | and not block.removed
79 | ]
80 |
81 | lines = []
82 | for line in page_lines + page_equations:
83 | line_obj = OCRJSONLineOutput(
84 | id=str(line.id),
85 | block_type=str(line.block_type),
86 | html="",
87 | polygon=line.polygon.polygon,
88 | bbox=line.polygon.bbox,
89 | )
90 | if line in page_equations:
91 | line_obj.html = line.html
92 | else:
93 | line_obj.html = line.formatted_text(document)
94 | spans = (
95 | [document.get_block(span_id) for span_id in line.structure]
96 | if line.structure
97 | else []
98 | )
99 | children = []
100 | for span in spans:
101 | if not span.structure:
102 | continue
103 |
104 | span_chars = [
105 | document.get_block(char_id) for char_id in span.structure
106 | ]
107 | children.extend(
108 | [
109 | OCRJSONCharOutput(
110 | id=str(char.id),
111 | block_type=str(char.block_type),
112 | text=char.text,
113 | polygon=char.polygon.polygon,
114 | bbox=char.polygon.bbox,
115 | )
116 | for char in span_chars
117 | ]
118 | )
119 | line_obj.children = children
120 | lines.append(line_obj)
121 |
122 | page = OCRJSONPageOutput(
123 | id=str(page.id),
124 | block_type=str(page.block_type),
125 | polygon=page.polygon.polygon,
126 | bbox=page.polygon.bbox,
127 | children=lines,
128 | )
129 | pages.append(page)
130 |
131 | return pages
132 |
133 | def __call__(self, document: Document) -> OCRJSONOutput:
134 | return OCRJSONOutput(children=self.extract_json(document), metadata=None)
135 |
```
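
The output models are plain pydantic classes, so the shape of the renderer's JSON can be sketched directly (coordinates below are made up; assumes pydantic v2 for `model_dump`):

```python
from marker.renderers.ocr_json import OCRJSONLineOutput, OCRJSONPageOutput

line = OCRJSONLineOutput(
    id="/page/0/Line/1", block_type="Line", html="<p>hello</p>",
    polygon=[[0, 0], [100, 0], [100, 20], [0, 20]], bbox=[0, 0, 100, 20],
)
page = OCRJSONPageOutput(
    id="/page/0", block_type="Page",
    polygon=[[0, 0], [612, 0], [612, 792], [0, 792]], bbox=[0, 0, 612, 792],
    children=[line],
)
print(page.model_dump()["children"][0]["html"])  # <p>hello</p>
```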
--------------------------------------------------------------------------------
/marker/services/openai.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import time
3 | from typing import Annotated, List
4 |
5 | import openai
6 | import PIL
7 | from marker.logger import get_logger
8 | from openai import APITimeoutError, RateLimitError
9 | from PIL import Image
10 | from pydantic import BaseModel
11 |
12 | from marker.schema.blocks import Block
13 | from marker.services import BaseService
14 |
15 | logger = get_logger()
16 |
17 |
18 | class OpenAIService(BaseService):
19 | openai_base_url: Annotated[
20 | str, "The base url to use for OpenAI-like models. No trailing slash."
21 | ] = "https://api.openai.com/v1"
22 | openai_model: Annotated[str, "The model name to use for OpenAI-like model."] = (
23 | "gpt-4o-mini"
24 | )
25 | openai_api_key: Annotated[
26 | str, "The API key to use for the OpenAI-like service."
27 | ] = None
28 | openai_image_format: Annotated[
29 | str,
30 | "The image format to use for the OpenAI-like service. Use 'png' for better compatability",
31 | ] = "webp"
32 |
33 | def process_images(self, images: List[Image.Image]) -> List[dict]:
34 | """
35 | Generate the base-64 encoded message to send to an
36 | openAI-compatabile multimodal model.
37 |
38 | Args:
39 | images: Image or list of PIL images to include
40 | format: Format to use for the image; use "png" for better compatability.
41 |
42 | Returns:
43 | A list of OpenAI-compatbile multimodal messages containing the base64-encoded images.
44 | """
45 | if isinstance(images, Image.Image):
46 | images = [images]
47 |
48 | img_fmt = self.openai_image_format
49 | return [
50 | {
51 | "type": "image_url",
52 | "image_url": {
53 | "url": "data:image/{};base64,{}".format(
54 | img_fmt, self.img_to_base64(img, format=img_fmt)
55 | ),
56 | },
57 | }
58 | for img in images
59 | ]
60 |
61 | def __call__(
62 | self,
63 | prompt: str,
64 | image: PIL.Image.Image | List[PIL.Image.Image] | None,
65 | block: Block | None,
66 | response_schema: type[BaseModel],
67 | max_retries: int | None = None,
68 | timeout: int | None = None,
69 | ):
70 | if max_retries is None:
71 | max_retries = self.max_retries
72 |
73 | if timeout is None:
74 | timeout = self.timeout
75 |
76 | client = self.get_client()
77 | image_data = self.format_image_for_llm(image)
78 |
79 | messages = [
80 | {
81 | "role": "user",
82 | "content": [
83 | *image_data,
84 | {"type": "text", "text": prompt},
85 | ],
86 | }
87 | ]
88 |
89 | total_tries = max_retries + 1
90 | for tries in range(1, total_tries + 1):
91 | try:
92 | response = client.beta.chat.completions.parse(
93 | extra_headers={
94 | "X-Title": "Marker",
95 | "HTTP-Referer": "https://github.com/datalab-to/marker",
96 | },
97 | model=self.openai_model,
98 | messages=messages,
99 | timeout=timeout,
100 | response_format=response_schema,
101 | )
102 | response_text = response.choices[0].message.content
103 | total_tokens = response.usage.total_tokens
104 | if block:
105 | block.update_metadata(
106 | llm_tokens_used=total_tokens, llm_request_count=1
107 | )
108 | return json.loads(response_text)
109 | except (APITimeoutError, RateLimitError) as e:
110 |             # Rate limit hit or request timed out
111 | if tries == total_tries:
112 | # Last attempt failed. Give up
113 | logger.error(
114 | f"Rate limit error: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
115 | )
116 | break
117 | else:
118 | wait_time = tries * self.retry_wait_time
119 | logger.warning(
120 | f"Rate limit error: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})",
121 | )
122 | time.sleep(wait_time)
123 | except Exception as e:
124 | logger.error(f"OpenAI inference failed: {e}")
125 | break
126 |
127 | return {}
128 |
129 | def get_client(self) -> openai.OpenAI:
130 | return openai.OpenAI(api_key=self.openai_api_key, base_url=self.openai_base_url)
131 |
```
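
A sketch of the image payload `process_images` builds, assuming the service can be constructed without arguments (marker services generally accept an optional config) and that `img_to_base64` is provided by `BaseService`:

```python
from PIL import Image
from marker.services.openai import OpenAIService

service = OpenAIService()  # assumption: defaults suffice for a payload-only demo
payload = service.process_images([Image.new("RGB", (8, 8), "white")])
print(payload[0]["image_url"]["url"][:23])  # data:image/webp;base64,
```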