This is page 2 of 9. Use http://codebase.md/datalab-to/marker?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── ISSUE_TEMPLATE │ │ ├── breaking-bug-report.md │ │ ├── feature_request.md │ │ └── output-bug-report.md │ └── workflows │ ├── benchmarks.yml │ ├── ci.yml │ ├── cla.yml │ ├── publish.yml │ └── scripts.yml ├── .gitignore ├── .pre-commit-config.yaml ├── benchmarks │ ├── __init__.py │ ├── overall │ │ ├── __init__.py │ │ ├── display │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ └── table.py │ │ ├── download │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── llamaparse.py │ │ │ ├── main.py │ │ │ ├── mathpix.py │ │ │ └── mistral.py │ │ ├── elo.py │ │ ├── methods │ │ │ ├── __init__.py │ │ │ ├── docling.py │ │ │ ├── gt.py │ │ │ ├── llamaparse.py │ │ │ ├── marker.py │ │ │ ├── mathpix.py │ │ │ ├── mistral.py │ │ │ ├── olmocr.py │ │ │ └── schema.py │ │ ├── overall.py │ │ ├── registry.py │ │ ├── schema.py │ │ └── scorers │ │ ├── __init__.py │ │ ├── clean.py │ │ ├── heuristic.py │ │ ├── llm.py │ │ └── schema.py │ ├── table │ │ ├── __init__.py │ │ ├── gemini.py │ │ ├── inference.py │ │ ├── scoring.py │ │ └── table.py │ ├── throughput │ │ ├── __init__.py │ │ └── main.py │ └── verify_scores.py ├── chunk_convert.py ├── CLA.md ├── convert_single.py ├── convert.py ├── data │ ├── .gitignore │ ├── examples │ │ ├── json │ │ │ ├── multicolcnn.json │ │ │ ├── switch_trans.json │ │ │ └── thinkpython.json │ │ └── markdown │ │ ├── multicolcnn │ │ │ ├── _page_1_Figure_0.jpeg │ │ │ ├── _page_2_Picture_0.jpeg │ │ │ ├── _page_6_Figure_0.jpeg │ │ │ ├── _page_7_Figure_0.jpeg │ │ │ ├── multicolcnn_meta.json │ │ │ └── multicolcnn.md │ │ ├── switch_transformers │ │ │ ├── _page_11_Figure_4.jpeg │ │ │ ├── _page_12_Figure_4.jpeg │ │ │ ├── _page_13_Figure_2.jpeg │ │ │ ├── _page_18_Figure_1.jpeg │ │ │ ├── _page_18_Figure_3.jpeg │ │ │ ├── _page_2_Figure_3.jpeg │ │ │ ├── _page_20_Figure_1.jpeg │ │ │ ├── _page_20_Figure_4.jpeg │ │ │ ├── _page_27_Figure_1.jpeg │ │ │ ├── _page_29_Figure_1.jpeg │ │ │ ├── _page_30_Figure_1.jpeg │ │ │ ├── _page_31_Figure_3.jpeg │ │ │ ├── _page_4_Figure_1.jpeg │ │ │ ├── _page_5_Figure_3.jpeg │ │ │ ├── switch_trans_meta.json │ │ │ └── switch_trans.md │ │ └── thinkpython │ │ ├── _page_109_Figure_1.jpeg │ │ ├── _page_115_Figure_1.jpeg │ │ ├── _page_116_Figure_3.jpeg │ │ ├── _page_127_Figure_1.jpeg │ │ ├── _page_128_Figure_1.jpeg │ │ ├── _page_167_Figure_1.jpeg │ │ ├── _page_169_Figure_1.jpeg │ │ ├── _page_173_Figure_1.jpeg │ │ ├── _page_190_Figure_1.jpeg │ │ ├── _page_195_Figure_1.jpeg │ │ ├── _page_205_Figure_1.jpeg │ │ ├── _page_23_Figure_1.jpeg │ │ ├── _page_23_Figure_3.jpeg │ │ ├── _page_230_Figure_1.jpeg │ │ ├── _page_233_Figure_1.jpeg │ │ ├── _page_233_Figure_3.jpeg │ │ ├── _page_234_Figure_1.jpeg │ │ ├── _page_235_Figure_1.jpeg │ │ ├── _page_236_Figure_1.jpeg │ │ ├── _page_236_Figure_3.jpeg │ │ ├── _page_237_Figure_1.jpeg │ │ ├── _page_238_Figure_1.jpeg │ │ ├── _page_46_Figure_1.jpeg │ │ ├── _page_60_Figure_1.jpeg │ │ ├── _page_60_Figure_3.jpeg │ │ ├── _page_67_Figure_1.jpeg │ │ ├── _page_71_Figure_1.jpeg │ │ ├── _page_78_Figure_1.jpeg │ │ ├── _page_85_Figure_1.jpeg │ │ ├── _page_94_Figure_1.jpeg │ │ ├── _page_99_Figure_17.jpeg │ │ ├── _page_99_Figure_178.jpeg │ │ ├── thinkpython_meta.json │ │ └── thinkpython.md │ ├── images │ │ ├── overall.png │ │ ├── per_doc.png │ │ └── table.png │ └── latex_to_md.sh ├── examples │ ├── marker_modal_deployment.py │ └── README.md ├── extraction_app.py ├── LICENSE ├── marker │ ├── builders │ │ ├── __init__.py │ │ ├── 
document.py │ │ ├── layout.py │ │ ├── line.py │ │ ├── ocr.py │ │ └── structure.py │ ├── config │ │ ├── __init__.py │ │ ├── crawler.py │ │ ├── parser.py │ │ └── printer.py │ ├── converters │ │ ├── __init__.py │ │ ├── extraction.py │ │ ├── ocr.py │ │ ├── pdf.py │ │ └── table.py │ ├── extractors │ │ ├── __init__.py │ │ ├── document.py │ │ └── page.py │ ├── logger.py │ ├── models.py │ ├── output.py │ ├── processors │ │ ├── __init__.py │ │ ├── blank_page.py │ │ ├── block_relabel.py │ │ ├── blockquote.py │ │ ├── code.py │ │ ├── debug.py │ │ ├── document_toc.py │ │ ├── equation.py │ │ ├── footnote.py │ │ ├── ignoretext.py │ │ ├── line_merge.py │ │ ├── line_numbers.py │ │ ├── list.py │ │ ├── llm │ │ │ ├── __init__.py │ │ │ ├── llm_complex.py │ │ │ ├── llm_equation.py │ │ │ ├── llm_form.py │ │ │ ├── llm_handwriting.py │ │ │ ├── llm_image_description.py │ │ │ ├── llm_mathblock.py │ │ │ ├── llm_meta.py │ │ │ ├── llm_page_correction.py │ │ │ ├── llm_sectionheader.py │ │ │ ├── llm_table_merge.py │ │ │ └── llm_table.py │ │ ├── order.py │ │ ├── page_header.py │ │ ├── reference.py │ │ ├── sectionheader.py │ │ ├── table.py │ │ ├── text.py │ │ └── util.py │ ├── providers │ │ ├── __init__.py │ │ ├── document.py │ │ ├── epub.py │ │ ├── html.py │ │ ├── image.py │ │ ├── pdf.py │ │ ├── powerpoint.py │ │ ├── registry.py │ │ ├── spreadsheet.py │ │ └── utils.py │ ├── renderers │ │ ├── __init__.py │ │ ├── chunk.py │ │ ├── extraction.py │ │ ├── html.py │ │ ├── json.py │ │ ├── markdown.py │ │ └── ocr_json.py │ ├── schema │ │ ├── __init__.py │ │ ├── blocks │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── basetable.py │ │ │ ├── caption.py │ │ │ ├── code.py │ │ │ ├── complexregion.py │ │ │ ├── equation.py │ │ │ ├── figure.py │ │ │ ├── footnote.py │ │ │ ├── form.py │ │ │ ├── handwriting.py │ │ │ ├── inlinemath.py │ │ │ ├── listitem.py │ │ │ ├── pagefooter.py │ │ │ ├── pageheader.py │ │ │ ├── picture.py │ │ │ ├── reference.py │ │ │ ├── sectionheader.py │ │ │ ├── table.py │ │ │ ├── tablecell.py │ │ │ ├── text.py │ │ │ └── toc.py │ │ ├── document.py │ │ ├── groups │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── figure.py │ │ │ ├── list.py │ │ │ ├── page.py │ │ │ ├── picture.py │ │ │ └── table.py │ │ ├── polygon.py │ │ ├── registry.py │ │ └── text │ │ ├── __init__.py │ │ ├── char.py │ │ ├── line.py │ │ └── span.py │ ├── scripts │ │ ├── __init__.py │ │ ├── chunk_convert.py │ │ ├── chunk_convert.sh │ │ ├── common.py │ │ ├── convert_single.py │ │ ├── convert.py │ │ ├── extraction_app.py │ │ ├── file_to_s3.py │ │ ├── run_streamlit_app.py │ │ ├── server.py │ │ └── streamlit_app.py │ ├── services │ │ ├── __init__.py │ │ ├── azure_openai.py │ │ ├── claude.py │ │ ├── gemini.py │ │ ├── ollama.py │ │ ├── openai.py │ │ └── vertex.py │ ├── settings.py │ ├── util.py │ └── utils │ ├── __init__.py │ ├── batch.py │ ├── gpu.py │ └── image.py ├── marker_app.py ├── marker_server.py ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── README.md ├── signatures │ └── version1 │ └── cla.json ├── static │ └── fonts │ └── .gitignore └── tests ├── builders │ ├── test_blank_page.py │ ├── test_document_builder.py │ ├── test_garbled_pdf.py │ ├── test_layout_replace.py │ ├── test_ocr_builder.py │ ├── test_ocr_pipeline.py │ ├── test_overriding.py │ ├── test_pdf_links.py │ ├── test_rotated_bboxes.py │ ├── test_strip_existing_ocr.py │ └── test_structure.py ├── config │ └── test_config.py ├── conftest.py ├── converters │ ├── test_extraction_converter.py │ ├── test_ocr_converter.py │ ├── test_pdf_converter.py │ └── test_table_converter.py ├── processors │ 
├── test_document_toc_processor.py │ ├── test_equation_processor.py │ ├── test_footnote_processor.py │ ├── test_ignoretext.py │ ├── test_llm_processors.py │ ├── test_table_merge.py │ └── test_table_processor.py ├── providers │ ├── test_document_providers.py │ ├── test_image_provider.py │ └── test_pdf_provider.py ├── renderers │ ├── test_chunk_renderer.py │ ├── test_extract_images.py │ ├── test_html_renderer.py │ ├── test_json_renderer.py │ └── test_markdown_renderer.py ├── schema │ └── groups │ └── test_list_grouping.py ├── services │ └── test_service_init.py └── utils.py ``` # Files -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- ```toml 1 | [tool.poetry] 2 | name = "marker-pdf" 3 | version = "1.10.1" 4 | description = "Convert documents to markdown with high speed and accuracy." 5 | authors = ["Vik Paruchuri <[email protected]>"] 6 | readme = "README.md" 7 | license = "GPL-3.0-or-later" 8 | repository = "https://github.com/VikParuchuri/marker" 9 | keywords = ["pdf", "markdown", "ocr", "nlp"] 10 | packages = [ 11 | {include = "marker"} 12 | ] 13 | include = [ 14 | "marker/scripts/*.sh", 15 | "marker/scripts/*.html", 16 | ] 17 | 18 | [tool.poetry.dependencies] 19 | python = "^3.10" 20 | Pillow = "^10.1.0" 21 | pydantic = "^2.4.2" 22 | pydantic-settings = "^2.0.3" 23 | transformers = "^4.45.2" 24 | python-dotenv = "^1.0.0" 25 | torch = "^2.7.0" 26 | tqdm = "^4.66.1" 27 | ftfy = "^6.1.1" 28 | rapidfuzz = "^3.8.1" 29 | surya-ocr = "^0.17.0" 30 | regex = "^2024.4.28" 31 | pdftext = "~0.6.3" 32 | markdownify = "^1.1.0" 33 | click = "^8.2.0" 34 | markdown2 = "^2.5.2" 35 | filetype = "^1.2.0" 36 | google-genai = "^1.0.0" 37 | anthropic = "^0.46.0" 38 | pre-commit = "^4.2.0" 39 | scikit-learn = "^1.6.1" 40 | 41 | # Optional dependencies for documents 42 | mammoth = {version = "^1.9.0", optional = true} 43 | openpyxl = {version = "^3.1.5", optional = true} 44 | python-pptx = {version = "^1.0.2", optional = true} 45 | ebooklib = {version = "^0.18", optional = true} 46 | weasyprint = {version = "^63.1", optional = true} 47 | openai = "^1.65.2" 48 | 49 | [tool.poetry.group.dev.dependencies] 50 | jupyter = "^1.0.0" 51 | datasets = "^2.21.0" 52 | streamlit = "^1.37.1" 53 | fastapi = "^0.115.4" 54 | uvicorn = "^0.32.0" 55 | python-multipart = "^0.0.16" 56 | pytest = "^8.3.3" 57 | pytest-mock = "^3.14.0" 58 | apted = "1.0.3" 59 | distance = "0.1.3" 60 | lxml = "5.3.0" 61 | tabulate = "^0.9.0" 62 | latex2mathml = "^3.77.0" 63 | playwright = "^1.49.1" 64 | 65 | [tool.poetry.extras] 66 | full = ["mammoth", "openpyxl", "python-pptx", "ebooklib", "weasyprint"] 67 | 68 | [tool.poetry.scripts] 69 | marker = "marker.scripts.convert:convert_cli" 70 | marker_single = "marker.scripts.convert_single:convert_single_cli" 71 | marker_chunk_convert = "marker.scripts.chunk_convert:chunk_convert_cli" 72 | marker_gui = "marker.scripts.run_streamlit_app:streamlit_app_cli" 73 | marker_extract = "marker.scripts.run_streamlit_app:extraction_app_cli" 74 | marker_server = "marker.scripts.server:server_cli" 75 | 76 | [build-system] 77 | requires = ["poetry-core"] 78 | build-backend = "poetry.core.masonry.api" 79 | ``` -------------------------------------------------------------------------------- /marker/converters/table.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Tuple, List 2 | 3 | from marker.builders.document import 
DocumentBuilder 4 | from marker.builders.line import LineBuilder 5 | from marker.builders.ocr import OcrBuilder 6 | from marker.converters.pdf import PdfConverter 7 | from marker.processors import BaseProcessor 8 | from marker.processors.llm.llm_complex import LLMComplexRegionProcessor 9 | from marker.processors.llm.llm_form import LLMFormProcessor 10 | from marker.processors.llm.llm_table import LLMTableProcessor 11 | from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor 12 | from marker.processors.table import TableProcessor 13 | from marker.providers.registry import provider_from_filepath 14 | from marker.schema import BlockTypes 15 | 16 | 17 | class TableConverter(PdfConverter): 18 | default_processors: Tuple[BaseProcessor, ...] = ( 19 | TableProcessor, 20 | LLMTableProcessor, 21 | LLMTableMergeProcessor, 22 | LLMFormProcessor, 23 | LLMComplexRegionProcessor, 24 | ) 25 | converter_block_types: List[BlockTypes] = ( 26 | BlockTypes.Table, 27 | BlockTypes.Form, 28 | BlockTypes.TableOfContents, 29 | ) 30 | 31 | def build_document(self, filepath: str): 32 | provider_cls = provider_from_filepath(filepath) 33 | layout_builder = self.resolve_dependencies(self.layout_builder_class) 34 | line_builder = self.resolve_dependencies(LineBuilder) 35 | ocr_builder = self.resolve_dependencies(OcrBuilder) 36 | document_builder = DocumentBuilder(self.config) 37 | document_builder.disable_ocr = True 38 | 39 | provider = provider_cls(filepath, self.config) 40 | document = document_builder(provider, layout_builder, line_builder, ocr_builder) 41 | 42 | for page in document.pages: 43 | page.structure = [ 44 | p for p in page.structure if p.block_type in self.converter_block_types 45 | ] 46 | 47 | for processor in self.processor_list: 48 | processor(document) 49 | 50 | return document 51 | 52 | def __call__(self, filepath: str): 53 | document = self.build_document(filepath) 54 | self.page_count = len(document.pages) 55 | 56 | renderer = self.resolve_dependencies(self.renderer) 57 | return renderer(document) 58 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/basetable.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.blocks import Block, BlockOutput 5 | from marker.schema.blocks.tablecell import TableCell 6 | 7 | 8 | class BaseTable(Block): 9 | block_type: BlockTypes | None = None 10 | html: str | None = None 11 | 12 | @staticmethod 13 | def format_cells( 14 | document, child_blocks, block_config, child_cells: List[TableCell] | None = None 15 | ): 16 | if child_cells is None: 17 | child_cells: List[TableCell] = [ 18 | document.get_block(c.id) 19 | for c in child_blocks 20 | if c.id.block_type == BlockTypes.TableCell 21 | ] 22 | 23 | unique_rows = sorted(list(set([c.row_id for c in child_cells]))) 24 | html_repr = "<table><tbody>" 25 | for row_id in unique_rows: 26 | row_cells = sorted( 27 | [c for c in child_cells if c.row_id == row_id], key=lambda x: x.col_id 28 | ) 29 | html_repr += "<tr>" 30 | for cell in row_cells: 31 | html_repr += cell.assemble_html( 32 | document, child_blocks, None, block_config 33 | ) 34 | html_repr += "</tr>" 35 | html_repr += "</tbody></table>" 36 | return html_repr 37 | 38 | def assemble_html( 39 | self, 40 | document, 41 | child_blocks: List[BlockOutput], 42 | parent_structure=None, 43 | block_config: dict | None = None, 44 | ): 45 | # Filter out the table cells, so 
they don't render twice 46 | child_ref_blocks = [ 47 | block 48 | for block in child_blocks 49 | if block.id.block_type == BlockTypes.Reference 50 | ] 51 | template = super().assemble_html( 52 | document, child_ref_blocks, parent_structure, block_config 53 | ) 54 | 55 | child_block_types = set([c.id.block_type for c in child_blocks]) 56 | if self.html: 57 | # LLM processor 58 | return template + self.html 59 | elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types: 60 | # Table processor 61 | return template + self.format_cells(document, child_blocks, block_config) 62 | else: 63 | # Default text lines and spans 64 | return f"<p>{template}</p>" 65 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/download/mathpix.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import time 3 | 4 | import requests 5 | 6 | from benchmarks.overall.download.base import Downloader 7 | 8 | 9 | class MathpixDownloader(Downloader): 10 | service = "mathpix" 11 | 12 | def get_html(self, pdf_bytes): 13 | headers = { 14 | "app_id": self.app_id, 15 | "app_key": self.api_key, 16 | } 17 | start = time.time() 18 | pdf_id = mathpix_request(pdf_bytes, headers) 19 | status = mathpix_status(pdf_id, headers) 20 | if status in ["processing", "error"]: 21 | md = "" 22 | else: 23 | md = mathpix_results(pdf_id, headers) 24 | end = time.time() 25 | if isinstance(md, bytes): 26 | md = md.decode("utf-8") 27 | 28 | return { 29 | "md": md, 30 | "time": end - start 31 | } 32 | 33 | def mathpix_request(buffer, headers): 34 | response = requests.post("https://api.mathpix.com/v3/pdf", 35 | headers=headers, 36 | data={ 37 | "options_json": json.dumps( 38 | { 39 | "conversion_formats": { 40 | "md": True, 41 | "html": True 42 | } 43 | } 44 | ) 45 | }, 46 | files={ 47 | "file": buffer 48 | } 49 | ) 50 | data = response.json() 51 | pdf_id = data["pdf_id"] 52 | return pdf_id 53 | 54 | def mathpix_status(pdf_id, headers): 55 | max_iters = 120 56 | i = 0 57 | status = "processing" 58 | status2 = "processing" 59 | while i < max_iters: 60 | time.sleep(1) 61 | response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}", 62 | headers=headers 63 | ) 64 | status_resp = response.json() 65 | if "conversion_status" not in status_resp: 66 | continue 67 | status = status_resp["conversion_status"]["md"]["status"] 68 | status2 = status_resp["conversion_status"]["html"]["status"] 69 | if status == "completed" and status2 == "completed": 70 | break 71 | elif status == "error" or status2 == "error": 72 | break 73 | out_status = "completed" if status == "completed" and status2 == "completed" else "error" 74 | return out_status 75 | 76 | def mathpix_results(pdf_id, headers, ext="md"): 77 | response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}", 78 | headers=headers 79 | ) 80 | return response.content 81 | ``` -------------------------------------------------------------------------------- /tests/builders/test_garbled_pdf.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.builders.document import DocumentBuilder 4 | from marker.builders.line import LineBuilder 5 | from marker.processors.table import TableProcessor 6 | from marker.schema import BlockTypes 7 | 8 | 9 | @pytest.mark.filename("water_damage.pdf") 10 | def test_garbled_pdf(pdf_document, recognition_model, table_rec_model, detection_model): 11 | assert 
pdf_document.pages[0].structure[0] == "/page/0/Table/0" 12 | 13 | table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0]) 14 | assert table_block.block_type == BlockTypes.Table 15 | assert table_block.structure[0] == "/page/0/Line/10" 16 | 17 | table_cell = pdf_document.pages[0].get_block(table_block.structure[0]) 18 | assert table_cell.block_type == BlockTypes.Line 19 | 20 | # We don't OCR in the initial pass, only with the TableProcessor 21 | processor = TableProcessor(recognition_model, table_rec_model, detection_model) 22 | processor(pdf_document) 23 | 24 | table = pdf_document.pages[0].contained_blocks(pdf_document, (BlockTypes.Table,))[0] 25 | assert "варіант" in table.raw_text(pdf_document) 26 | 27 | table_cell = pdf_document.pages[0].get_block(table_block.structure[0]) 28 | assert table_cell.block_type == BlockTypes.TableCell 29 | 30 | 31 | @pytest.mark.filename("hindi_judgement.pdf") 32 | @pytest.mark.config({"page_range": [2, 3], "disable_ocr": True}) 33 | def test_garbled_builder(config, doc_provider, detection_model, ocr_error_model): 34 | line_builder = LineBuilder(detection_model, ocr_error_model, config) 35 | builder = DocumentBuilder(config) 36 | document = builder.build_document(doc_provider) 37 | 38 | bad_ocr_results = line_builder.ocr_error_detection( 39 | document.pages, doc_provider.page_lines 40 | ) 41 | assert len(bad_ocr_results.labels) == 2 42 | assert any([label == "bad" for label in bad_ocr_results.labels]) 43 | 44 | 45 | @pytest.mark.filename("adversarial.pdf") 46 | @pytest.mark.config({"page_range": [2, 3], "disable_ocr": True}) 47 | def test_nongarbled_builder(config, doc_provider, detection_model, ocr_error_model): 48 | line_builder = LineBuilder(detection_model, ocr_error_model, config) 49 | builder = DocumentBuilder(config) 50 | document = builder.build_document(doc_provider) 51 | 52 | bad_ocr_results = line_builder.ocr_error_detection( 53 | document.pages, doc_provider.page_lines 54 | ) 55 | assert len(bad_ocr_results.labels) == 2 56 | assert all([label == "good" for label in bad_ocr_results.labels]) 57 | ``` -------------------------------------------------------------------------------- /marker/processors/order.py: -------------------------------------------------------------------------------- ```python 1 | from statistics import mean 2 | from collections import defaultdict 3 | 4 | from marker.processors import BaseProcessor 5 | from marker.schema import BlockTypes 6 | from marker.schema.document import Document 7 | 8 | 9 | class OrderProcessor(BaseProcessor): 10 | """ 11 | A processor for sorting the blocks in order if needed. This can help when the layout image was sliced. 
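Blocks whose spans kept their original PDF positions are ordered by the midpoint of those positions; blocks without spans are slotted in relative to the nearest already-positioned neighbor.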
12 | """ 13 | block_types = tuple() 14 | 15 | def __call__(self, document: Document): 16 | for page in document.pages: 17 | # Skip OCRed pages 18 | if page.text_extraction_method != "pdftext": 19 | continue 20 | 21 | # Skip pages without layout slicing 22 | if not page.layout_sliced: 23 | continue 24 | 25 | block_idxs = defaultdict(int) 26 | for block_id in page.structure: 27 | block = document.get_block(block_id) 28 | spans = block.contained_blocks(document, (BlockTypes.Span, )) 29 | if len(spans) == 0: 30 | continue 31 | 32 | # Avg span position in original PDF 33 | block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2 34 | 35 | for block_id in page.structure: 36 | # Already assigned block id via span position 37 | if block_idxs[block_id] > 0: 38 | continue 39 | 40 | block = document.get_block(block_id) 41 | prev_block = document.get_prev_block(block) 42 | next_block = document.get_next_block(block) 43 | 44 | block_idx_add = 0 45 | if prev_block: 46 | block_idx_add = 1 47 | 48 | while prev_block and prev_block.id not in block_idxs: 49 | prev_block = document.get_prev_block(prev_block) 50 | block_idx_add += 1 51 | 52 | if not prev_block: 53 | block_idx_add = -1 54 | while next_block and next_block.id not in block_idxs: 55 | next_block = document.get_next_block(next_block) 56 | block_idx_add -= 1 57 | 58 | if not next_block and not prev_block: 59 | pass 60 | elif prev_block: 61 | block_idxs[block_id] = block_idxs[prev_block.id] + block_idx_add 62 | else: 63 | block_idxs[block_id] = block_idxs[next_block.id] + block_idx_add 64 | 65 | page.structure = sorted(page.structure, key=lambda x: block_idxs[x]) 66 | 67 | ``` -------------------------------------------------------------------------------- /tests/converters/test_extraction_converter.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import pytest 3 | 4 | from marker.converters.extraction import ExtractionConverter 5 | from marker.extractors.page import PageExtractionSchema 6 | from marker.extractors.document import DocumentExtractionSchema 7 | from marker.services import BaseService 8 | 9 | 10 | class MockLLMService(BaseService): 11 | def __call__(self, prompt, image=None, page=None, response_schema=None, **kwargs): 12 | if response_schema == PageExtractionSchema: 13 | return { 14 | "description": "Mock extraction description", 15 | "detailed_notes": "Mock detailed notes for page extraction", 16 | } 17 | elif response_schema == DocumentExtractionSchema: 18 | return { 19 | "analysis": "Mock document analysis", 20 | "document_json": json.dumps({"test_key": "test_value"}), 21 | } 22 | return {} 23 | 24 | 25 | @pytest.fixture 26 | def mock_llm_service(): 27 | return MockLLMService 28 | 29 | 30 | @pytest.fixture 31 | def extraction_converter(config, model_dict, mock_llm_service): 32 | test_schema = { 33 | "title": "TestSchema", 34 | "type": "object", 35 | "properties": {"test_key": {"title": "Test Key", "type": "string"}}, 36 | "required": ["test_key"], 37 | } 38 | 39 | config["page_schema"] = json.dumps(test_schema) 40 | config["output_format"] = "markdown" 41 | model_dict["llm_service"] = mock_llm_service 42 | 43 | converter = ExtractionConverter( 44 | artifact_dict=model_dict, processor_list=None, config=config 45 | ) 46 | converter.llm_service = mock_llm_service 47 | converter.default_llm_service = MockLLMService 48 | return converter 49 | 50 | 51 | @pytest.mark.config({"page_range": [0]}) 52 | def test_extraction_converter(config, model_dict, 
mock_llm_service, temp_doc): 53 | config["page_schema"] = "invalid json" 54 | 55 | model_dict["llm_service"] = mock_llm_service 56 | converter = ExtractionConverter( 57 | artifact_dict=model_dict, processor_list=None, config=config 58 | ) 59 | converter.artifact_dict["llm_service"] = mock_llm_service() 60 | 61 | results = converter(temp_doc.name) 62 | assert results.document_json == '{"test_key": "test_value"}' 63 | 64 | 65 | @pytest.mark.config({"page_range": [0, 1]}) 66 | def test_extraction_converter_multiple_pages(extraction_converter, temp_doc): 67 | result = extraction_converter(temp_doc.name) 68 | 69 | assert result is not None 70 | assert result.document_json is not None 71 | assert json.loads(result.document_json) == {"test_key": "test_value"} 72 | assert result.analysis == "Mock document analysis" 73 | ``` -------------------------------------------------------------------------------- /marker/processors/llm/llm_meta.py: -------------------------------------------------------------------------------- ```python 1 | from concurrent.futures import ThreadPoolExecutor 2 | from typing import List, Dict, Any 3 | 4 | from marker.logger import get_logger 5 | from tqdm import tqdm 6 | 7 | from marker.processors.llm import BaseLLMSimpleBlockProcessor, BaseLLMProcessor 8 | from marker.schema.document import Document 9 | from marker.services import BaseService 10 | 11 | logger = get_logger() 12 | 13 | 14 | class LLMSimpleBlockMetaProcessor(BaseLLMProcessor): 15 | """ 16 | A wrapper for simple LLM processors, so they can all run in parallel. 17 | """ 18 | 19 | def __init__( 20 | self, 21 | processor_lst: List[BaseLLMSimpleBlockProcessor], 22 | llm_service: BaseService, 23 | config=None, 24 | ): 25 | super().__init__(llm_service, config) 26 | self.processors = processor_lst 27 | 28 | def __call__(self, document: Document): 29 | if not self.use_llm or self.llm_service is None: 30 | return 31 | 32 | total = sum( 33 | [len(processor.inference_blocks(document)) for processor in self.processors] 34 | ) 35 | pbar = tqdm( 36 | desc="LLM processors running", disable=self.disable_tqdm, total=total 37 | ) 38 | 39 | all_prompts = [ 40 | processor.block_prompts(document) for processor in self.processors 41 | ] 42 | pending = [] 43 | futures_map = {} 44 | with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: 45 | for i, prompt_lst in enumerate(all_prompts): 46 | for prompt in prompt_lst: 47 | future = executor.submit(self.get_response, prompt) 48 | pending.append(future) 49 | futures_map[future] = {"processor_idx": i, "prompt_data": prompt} 50 | 51 | for future in pending: 52 | try: 53 | result = future.result() 54 | future_data = futures_map.pop(future) 55 | processor: BaseLLMSimpleBlockProcessor = self.processors[ 56 | future_data["processor_idx"] 57 | ] 58 | # finalize the result 59 | processor(result, future_data["prompt_data"], document) 60 | except Exception as e: 61 | logger.warning(f"Error processing LLM response: {e}") 62 | 63 | pbar.update(1) 64 | 65 | pbar.close() 66 | 67 | def get_response(self, prompt_data: Dict[str, Any]): 68 | return self.llm_service( 69 | prompt_data["prompt"], 70 | prompt_data["image"], 71 | prompt_data["block"], 72 | prompt_data["schema"], 73 | ) 74 | ``` -------------------------------------------------------------------------------- /marker/converters/__init__.py: -------------------------------------------------------------------------------- ```python 1 | import inspect 2 | from typing import Optional, List, Type 3 | 4 | from pydantic import BaseModel 
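# BaseConverter below wires providers, builders, processors, and a renderer together.
# Rough usage sketch for a concrete subclass (create_model_dict lives in marker/models.py
# in this repo; the input filename is illustrative):
#
#     from marker.converters.pdf import PdfConverter
#     from marker.models import create_model_dict
#
#     converter = PdfConverter(artifact_dict=create_model_dict())
#     rendered = converter("paper.pdf")  # returns the renderer's output object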
5 | 6 | from marker.processors import BaseProcessor 7 | from marker.processors.llm import BaseLLMSimpleBlockProcessor 8 | from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor 9 | from marker.util import assign_config, download_font 10 | 11 | 12 | class BaseConverter: 13 | def __init__(self, config: Optional[BaseModel | dict] = None): 14 | assign_config(self, config) 15 | self.config = config 16 | self.llm_service = None 17 | 18 | # Download render font, needed for some providers 19 | download_font() 20 | 21 | def __call__(self, *args, **kwargs): 22 | raise NotImplementedError 23 | 24 | def resolve_dependencies(self, cls): 25 | init_signature = inspect.signature(cls.__init__) 26 | parameters = init_signature.parameters 27 | 28 | resolved_kwargs = {} 29 | for param_name, param in parameters.items(): 30 | if param_name == 'self': 31 | continue 32 | elif param_name == 'config': 33 | resolved_kwargs[param_name] = self.config 34 | elif param.name in self.artifact_dict: 35 | resolved_kwargs[param_name] = self.artifact_dict[param_name] 36 | elif param.default != inspect.Parameter.empty: 37 | resolved_kwargs[param_name] = param.default 38 | else: 39 | raise ValueError(f"Cannot resolve dependency for parameter: {param_name}") 40 | 41 | return cls(**resolved_kwargs) 42 | 43 | def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) -> List[BaseProcessor]: 44 | processors = [] 45 | for processor_cls in processor_cls_lst: 46 | processors.append(self.resolve_dependencies(processor_cls)) 47 | 48 | simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)] 49 | other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)] 50 | 51 | if not simple_llm_processors: 52 | return processors 53 | 54 | llm_positions = [i for i, p in enumerate(processors) if issubclass(type(p), BaseLLMSimpleBlockProcessor)] 55 | insert_position = max(0, llm_positions[-1] - len(simple_llm_processors) + 1) 56 | 57 | meta_processor = LLMSimpleBlockMetaProcessor( 58 | processor_lst=simple_llm_processors, 59 | llm_service=self.llm_service, 60 | config=self.config, 61 | ) 62 | other_processors.insert(insert_position, meta_processor) 63 | return other_processors ``` -------------------------------------------------------------------------------- /marker/providers/registry.py: -------------------------------------------------------------------------------- ```python 1 | import filetype 2 | import filetype.match as file_match 3 | from bs4 import BeautifulSoup 4 | from filetype.types import archive, document, IMAGE 5 | 6 | from marker.providers.document import DocumentProvider 7 | from marker.providers.epub import EpubProvider 8 | from marker.providers.html import HTMLProvider 9 | from marker.providers.image import ImageProvider 10 | from marker.providers.pdf import PdfProvider 11 | from marker.providers.powerpoint import PowerPointProvider 12 | from marker.providers.spreadsheet import SpreadSheetProvider 13 | 14 | DOCTYPE_MATCHERS = { 15 | "image": IMAGE, 16 | "pdf": [ 17 | archive.Pdf, 18 | ], 19 | "epub": [ 20 | archive.Epub, 21 | ], 22 | "doc": [document.Docx], 23 | "xls": [document.Xlsx], 24 | "ppt": [document.Pptx], 25 | } 26 | 27 | 28 | def load_matchers(doctype: str): 29 | return [cls() for cls in DOCTYPE_MATCHERS[doctype]] 30 | 31 | 32 | def load_extensions(doctype: str): 33 | return [cls.EXTENSION for cls in DOCTYPE_MATCHERS[doctype]] 34 | 35 | 36 | def provider_from_ext(filepath: str): 37 | ext = 
filepath.rsplit(".", 1)[-1].strip() 38 | if not ext: 39 | return PdfProvider 40 | 41 | if ext in load_extensions("image"): 42 | return ImageProvider 43 | if ext in load_extensions("pdf"): 44 | return PdfProvider 45 | if ext in load_extensions("doc"): 46 | return DocumentProvider 47 | if ext in load_extensions("xls"): 48 | return SpreadSheetProvider 49 | if ext in load_extensions("ppt"): 50 | return PowerPointProvider 51 | if ext in load_extensions("epub"): 52 | return EpubProvider 53 | if ext in ["html"]: 54 | return HTMLProvider 55 | 56 | return PdfProvider 57 | 58 | 59 | def provider_from_filepath(filepath: str): 60 | if filetype.image_match(filepath) is not None: 61 | return ImageProvider 62 | if file_match(filepath, load_matchers("pdf")) is not None: 63 | return PdfProvider 64 | if file_match(filepath, load_matchers("epub")) is not None: 65 | return EpubProvider 66 | if file_match(filepath, load_matchers("doc")) is not None: 67 | return DocumentProvider 68 | if file_match(filepath, load_matchers("xls")) is not None: 69 | return SpreadSheetProvider 70 | if file_match(filepath, load_matchers("ppt")) is not None: 71 | return PowerPointProvider 72 | 73 | try: 74 | with open(filepath, "r", encoding="utf-8") as f: 75 | soup = BeautifulSoup(f.read(), "html.parser") 76 | # Check if there are any HTML tags 77 | if bool(soup.find()): 78 | return HTMLProvider 79 | except Exception: 80 | pass 81 | 82 | # Fallback if we incorrectly detect the file type 83 | return provider_from_ext(filepath) 84 | ``` -------------------------------------------------------------------------------- /marker/processors/blank_page.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Annotated 2 | 3 | from PIL import Image 4 | import numpy as np 5 | import cv2 6 | 7 | from marker.processors import BaseProcessor 8 | from marker.schema import BlockTypes 9 | from marker.schema.blocks import Block 10 | from marker.schema.document import Document 11 | 12 | from marker.logger import get_logger 13 | 14 | logger = get_logger() 15 | 16 | 17 | class BlankPageProcessor(BaseProcessor): 18 | """ 19 | A processor to filter out blank pages detected as a single layout block 20 | """ 21 | 22 | full_page_block_intersection_threshold: Annotated[ 23 | float, "Threshold to detect blank pages at" 24 | ] = 0.8 25 | filter_blank_pages: Annotated[bool, "Remove blank pages detected as images."] = ( 26 | False 27 | ) 28 | 29 | def is_blank(self, image: Image.Image): 30 | image = np.asarray(image) 31 | if image.size == 0 or image.shape[0] == 0 or image.shape[1] == 0: 32 | # Handle empty image case 33 | return True 34 | 35 | gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) 36 | gray = cv2.GaussianBlur(gray, (7, 7), 0) 37 | 38 | # Adaptive threshold (inverse for text as white) 39 | binarized = cv2.adaptiveThreshold( 40 | gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15 41 | ) 42 | 43 | num_labels, labels, stats, _ = cv2.connectedComponentsWithStats( 44 | binarized, connectivity=8 45 | ) 46 | cleaned = np.zeros_like(binarized) 47 | for i in range(1, num_labels): # skip background 48 | cleaned[labels == i] = 255 49 | 50 | kernel = np.ones((1, 5), np.uint8) 51 | dilated = cv2.dilate(cleaned, kernel, iterations=3) 52 | b = dilated / 255 53 | return b.sum() == 0 54 | 55 | def __call__(self, document: Document): 56 | if not self.filter_blank_pages: 57 | return 58 | 59 | for page in document.pages: 60 | structure_blocks = page.structure_blocks(document) 61 | if not 
structure_blocks or len(structure_blocks) > 1: 62 | continue 63 | 64 | full_page_block: Block = structure_blocks[0] 65 | 66 | conditions = [ 67 | full_page_block.block_type in [BlockTypes.Picture, BlockTypes.Figure], 68 | self.is_blank(full_page_block.get_image(document)), 69 | page.polygon.intersection_area(full_page_block.polygon) 70 | > self.full_page_block_intersection_threshold, 71 | ] 72 | 73 | if all(conditions): 74 | logger.debug(f"Removing blank block {full_page_block.id}") 75 | page.remove_structure_items([full_page_block.id]) 76 | full_page_block.removed = True 77 | ``` -------------------------------------------------------------------------------- /marker/processors/util.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.groups import PageGroup 7 | from marker.schema.registry import get_block_class 8 | from marker.schema.text import Line 9 | 10 | 11 | def escape_latex_commands(text: str): 12 | text = (text 13 | .replace('\n', '\\n') 14 | .replace('\t', '\\t') 15 | .replace('\r', '\\r')) 16 | return text 17 | 18 | 19 | def add_math_spans_to_line(corrected_text: str, text_line: Line, page: PageGroup): 20 | SpanClass = get_block_class(BlockTypes.Span) 21 | corrected_spans = text_to_spans(corrected_text) 22 | 23 | for span_idx, span in enumerate(corrected_spans): 24 | if span_idx == len(corrected_spans) - 1: 25 | span['content'] += "\n" 26 | 27 | span_block = page.add_full_block( 28 | SpanClass( 29 | polygon=text_line.polygon, 30 | text=span['content'], 31 | font='Unknown', 32 | font_weight=0, 33 | font_size=0, 34 | minimum_position=0, 35 | maximum_position=0, 36 | formats=[span['type']], 37 | url=span.get('url'), 38 | page_id=text_line.page_id, 39 | text_extraction_method="gemini", 40 | has_superscript=span["has_superscript"], 41 | has_subscript=span["has_subscript"] 42 | ) 43 | ) 44 | text_line.structure.append(span_block.id) 45 | 46 | 47 | def text_to_spans(text): 48 | soup = BeautifulSoup(text, 'html.parser') 49 | 50 | tag_types = { 51 | 'b': 'bold', 52 | 'i': 'italic', 53 | 'math': 'math', 54 | 'sub': 'plain', 55 | 'sup': 'plain', 56 | 'span': 'plain' 57 | } 58 | spans = [] 59 | 60 | for element in soup.descendants: 61 | if not len(list(element.parents)) == 1: 62 | continue 63 | 64 | url = element.attrs.get('href') if hasattr(element, 'attrs') else None 65 | 66 | if element.name in tag_types: 67 | text = element.get_text() 68 | if element.name == "math": 69 | text = escape_latex_commands(text) 70 | spans.append({ 71 | 'type': tag_types[element.name], 72 | 'content': text, 73 | 'url': url, 74 | "has_superscript": element.name == "sup", 75 | "has_subscript": element.name == "sub" 76 | }) 77 | elif element.string: 78 | spans.append({ 79 | 'type': 'plain', 80 | 'content': element.string, 81 | 'url': url, 82 | "has_superscript": False, 83 | "has_subscript": False 84 | }) 85 | 86 | return spans ``` -------------------------------------------------------------------------------- /marker/providers/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from copy import deepcopy 2 | from typing import List, Optional, Dict 3 | 4 | from PIL import Image 5 | from pydantic import BaseModel 6 | 7 | from pdftext.schema import Reference 8 | 9 | from marker.logger import configure_logging 10 | from marker.schema.polygon import PolygonBox 11 | from marker.schema.text import Span 12 
| from marker.schema.text.char import Char 13 | from marker.schema.text.line import Line 14 | from marker.settings import settings 15 | from marker.util import assign_config 16 | 17 | configure_logging() 18 | 19 | 20 | class ProviderOutput(BaseModel): 21 | line: Line 22 | spans: List[Span] 23 | chars: Optional[List[List[Char]]] = None 24 | 25 | @property 26 | def raw_text(self): 27 | return "".join(span.text for span in self.spans) 28 | 29 | def __hash__(self): 30 | return hash(tuple(self.line.polygon.bbox)) 31 | 32 | def merge(self, other: "ProviderOutput"): 33 | new_output = deepcopy(self) 34 | other_copy = deepcopy(other) 35 | 36 | new_output.spans.extend(other_copy.spans) 37 | if new_output.chars is not None and other_copy.chars is not None: 38 | new_output.chars.extend(other_copy.chars) 39 | elif other_copy.chars is not None: 40 | new_output.chars = other_copy.chars 41 | 42 | new_output.line.polygon = new_output.line.polygon.merge( 43 | [other_copy.line.polygon] 44 | ) 45 | return new_output 46 | 47 | 48 | ProviderPageLines = Dict[int, List[ProviderOutput]] 49 | 50 | 51 | class BaseProvider: 52 | def __init__(self, filepath: str, config: Optional[BaseModel | dict] = None): 53 | assign_config(self, config) 54 | self.filepath = filepath 55 | 56 | def __len__(self): 57 | pass 58 | 59 | def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]: 60 | pass 61 | 62 | def get_page_bbox(self, idx: int) -> PolygonBox | None: 63 | pass 64 | 65 | def get_page_lines(self, idx: int) -> List[Line]: 66 | pass 67 | 68 | def get_page_refs(self, idx: int) -> List[Reference]: 69 | pass 70 | 71 | def __enter__(self): 72 | return self 73 | 74 | @staticmethod 75 | def get_font_css(): 76 | from weasyprint import CSS 77 | from weasyprint.text.fonts import FontConfiguration 78 | 79 | font_config = FontConfiguration() 80 | css = CSS( 81 | string=f""" 82 | @font-face {{ 83 | font-family: GoNotoCurrent-Regular; 84 | src: url({settings.FONT_PATH}); 85 | font-display: swap; 86 | }} 87 | body {{ 88 | font-family: {settings.FONT_NAME.split(".")[0]}, sans-serif; 89 | font-variant-ligatures: none; 90 | font-feature-settings: "liga" 0; 91 | text-rendering: optimizeLegibility; 92 | }} 93 | """, 94 | font_config=font_config, 95 | ) 96 | return css 97 | ``` -------------------------------------------------------------------------------- /marker/providers/document.py: -------------------------------------------------------------------------------- ```python 1 | import base64 2 | import os 3 | import re 4 | import tempfile 5 | from io import BytesIO 6 | 7 | from PIL import Image 8 | from marker.logger import get_logger 9 | 10 | from marker.providers.pdf import PdfProvider 11 | 12 | logger = get_logger() 13 | 14 | css = """ 15 | @page { 16 | size: A4; 17 | margin: 2cm; 18 | } 19 | 20 | img { 21 | max-width: 100%; 22 | max-height: 25cm; 23 | object-fit: contain; 24 | margin: 12pt auto; 25 | } 26 | 27 | div, p { 28 | max-width: 100%; 29 | word-break: break-word; 30 | font-size: 10pt; 31 | } 32 | 33 | table { 34 | width: 100%; 35 | border-collapse: collapse; 36 | break-inside: auto; 37 | font-size: 10pt; 38 | } 39 | 40 | tr { 41 | break-inside: avoid; 42 | page-break-inside: avoid; 43 | } 44 | 45 | td { 46 | border: 0.75pt solid #000; 47 | padding: 6pt; 48 | } 49 | """ 50 | 51 | 52 | class DocumentProvider(PdfProvider): 53 | def __init__(self, filepath: str, config=None): 54 | temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") 55 | self.temp_pdf_path = temp_pdf.name 56 | temp_pdf.close() 57 | 58 | # 
Convert DOCX to PDF 59 | try: 60 | self.convert_docx_to_pdf(filepath) 61 | except Exception as e: 62 | raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}") 63 | 64 | # Initialize the PDF provider with the temp pdf path 65 | super().__init__(self.temp_pdf_path, config) 66 | 67 | def __del__(self): 68 | if os.path.exists(self.temp_pdf_path): 69 | os.remove(self.temp_pdf_path) 70 | 71 | def convert_docx_to_pdf(self, filepath: str): 72 | from weasyprint import CSS, HTML 73 | import mammoth 74 | 75 | with open(filepath, "rb") as docx_file: 76 | # we convert the docx to HTML 77 | result = mammoth.convert_to_html(docx_file) 78 | html = result.value 79 | 80 | # We convert the HTML into a PDF 81 | HTML(string=self._preprocess_base64_images(html)).write_pdf( 82 | self.temp_pdf_path, stylesheets=[CSS(string=css), self.get_font_css()] 83 | ) 84 | 85 | @staticmethod 86 | def _preprocess_base64_images(html_content): 87 | pattern = r'data:([^;]+);base64,([^"\'>\s]+)' 88 | 89 | def convert_image(match): 90 | try: 91 | img_data = base64.b64decode(match.group(2)) 92 | 93 | with BytesIO(img_data) as bio: 94 | with Image.open(bio) as img: 95 | output = BytesIO() 96 | img.save(output, format=img.format) 97 | new_base64 = base64.b64encode(output.getvalue()).decode() 98 | return f"data:{match.group(1)};base64,{new_base64}" 99 | 100 | except Exception as e: 101 | logger.error(f"Failed to process image: {e}") 102 | return "" # we ditch broken images as that breaks the PDF creation down the line 103 | 104 | return re.sub(pattern, convert_image, html_content) 105 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/olmocr.py: -------------------------------------------------------------------------------- ```python 1 | import base64 2 | import json 3 | import tempfile 4 | import time 5 | from io import BytesIO 6 | 7 | import torch 8 | from PIL import Image 9 | 10 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 11 | 12 | 13 | def convert_single_page(filename: str, model, processor, device): 14 | from olmocr.data.renderpdf import render_pdf_to_base64png 15 | from olmocr.prompts import build_finetuning_prompt 16 | from olmocr.prompts.anchor import get_anchor_text 17 | 18 | image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1024) 19 | 20 | # Build the prompt, using document metadata 21 | anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=4000) 22 | prompt = build_finetuning_prompt(anchor_text) 23 | 24 | # Build the full prompt 25 | messages = [ 26 | { 27 | "role": "user", 28 | "content": [ 29 | {"type": "text", "text": prompt}, 30 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}}, 31 | ], 32 | } 33 | ] 34 | 35 | # Apply the chat template and processor 36 | text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 37 | main_image = Image.open(BytesIO(base64.b64decode(image_base64))) 38 | 39 | inputs = processor( 40 | text=[text], 41 | images=[main_image], 42 | padding=True, 43 | return_tensors="pt", 44 | ) 45 | inputs = {key: value.to(device) for (key, value) in inputs.items()} 46 | 47 | # Generate the output 48 | output = model.generate( 49 | **inputs, 50 | temperature=0.8, 51 | max_new_tokens=8192, 52 | num_return_sequences=1, 53 | do_sample=True, 54 | ) 55 | 56 | # Decode the output 57 | prompt_length = inputs["input_ids"].shape[1] 58 | new_tokens = output[:, prompt_length:] 59 | text_output = 
processor.tokenizer.batch_decode( 60 | new_tokens, skip_special_tokens=True 61 | )[0] 62 | 63 | try: 64 | text_output = json.loads(text_output) 65 | text = text_output["natural_text"] 66 | except Exception: 67 | try: 68 | text = text_output.split("natural_text")[1].strip() 69 | except Exception: 70 | text = "" 71 | 72 | return text 73 | 74 | 75 | class OlmOCRMethod(BaseMethod): 76 | olmocr_model: dict = None 77 | use_llm: bool = False 78 | 79 | def __call__(self, sample) -> BenchmarkResult: 80 | pdf_bytes = sample["pdf"] # This is a single page PDF 81 | 82 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: 83 | f.write(pdf_bytes) 84 | start = time.time() 85 | result = convert_single_page(f.name, self.olmocr_model["model"], self.olmocr_model["processor"], self.olmocr_model["model"].device) 86 | total = time.time() - start 87 | 88 | return { 89 | "markdown": result, 90 | "time": total 91 | } 92 | ``` -------------------------------------------------------------------------------- /tests/renderers/test_markdown_renderer.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.renderers.markdown import MarkdownRenderer 4 | from marker.schema import BlockTypes 5 | from marker.schema.blocks import TableCell 6 | 7 | 8 | @pytest.mark.config({"page_range": [0], "disable_ocr": True}) 9 | def test_markdown_renderer(pdf_document): 10 | renderer = MarkdownRenderer() 11 | md = renderer(pdf_document).markdown 12 | 13 | # Verify markdown 14 | assert "# Subspace Adversarial Training" in md 15 | 16 | 17 | @pytest.mark.config({"page_range": [0]}) 18 | def test_markdown_renderer_auto_ocr(pdf_document): 19 | renderer = MarkdownRenderer() 20 | md = renderer(pdf_document).markdown 21 | 22 | # Verify markdown 23 | assert "Subspace Adversarial Training" in md 24 | 25 | 26 | @pytest.mark.config({"page_range": [0, 1], "paginate_output": True}) 27 | def test_markdown_renderer_pagination(pdf_document): 28 | renderer = MarkdownRenderer({"paginate_output": True}) 29 | md = renderer(pdf_document).markdown 30 | 31 | assert "\n\n{0}-" in md 32 | assert "\n\n{1}-" in md 33 | 34 | 35 | @pytest.mark.config({"page_range": [0, 1], "paginate_output": True}) 36 | def test_markdown_renderer_pagination_blank_last_page(pdf_document): 37 | # Clear all children and structure from the last page to simulate a blank page 38 | last_page = pdf_document.pages[-1] 39 | last_page.children = [] 40 | last_page.structure = [] 41 | 42 | renderer = MarkdownRenderer({"paginate_output": True}) 43 | md = renderer(pdf_document).markdown 44 | 45 | # Should end with pagination marker and preserve trailing newlines 46 | assert md.endswith("}\n\n") or md.endswith( 47 | "}------------------------------------------------\n\n" 48 | ) 49 | 50 | 51 | @pytest.mark.config({"page_range": [0, 1]}) 52 | def test_markdown_renderer_metadata(pdf_document): 53 | renderer = MarkdownRenderer({"paginate_output": True}) 54 | metadata = renderer(pdf_document).metadata 55 | assert "table_of_contents" in metadata 56 | 57 | 58 | @pytest.mark.config({"page_range": [0, 1]}) 59 | def test_markdown_renderer_images(pdf_document): 60 | renderer = MarkdownRenderer({"extract_images": False}) 61 | markdown_output = renderer(pdf_document) 62 | 63 | assert len(markdown_output.images) == 0 64 | assert "![](" not in markdown_output.markdown 65 | 66 | 67 | @pytest.mark.config({"page_range": [5]}) 68 | def test_markdown_renderer_tables(pdf_document): 69 | table = pdf_document.contained_blocks((BlockTypes.Table,))[0] 70 | page = pdf_document.pages[0] 71 | 72 | cell = TableCell( 
polygon=table.polygon, 74 | text_lines=["54<i>.45</i>67<br>89<math>x</math>"], 75 | rowspan=1, 76 | colspan=1, 77 | row_id=0, 78 | col_id=0, 79 | is_header=False, 80 | page_id=page.page_id, 81 | ) 82 | page.add_full_block(cell) 83 | table.structure = [] 84 | table.add_structure(cell) 85 | 86 | renderer = MarkdownRenderer() 87 | md = renderer(pdf_document).markdown 88 | assert "54 <i>.45</i> 67<br>89 $x$" in md 89 | ``` -------------------------------------------------------------------------------- /marker/schema/registry.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Dict, Type 2 | from importlib import import_module 3 | 4 | from marker.schema import BlockTypes 5 | from marker.schema.blocks import ( 6 | Block, 7 | Caption, 8 | Code, 9 | Equation, 10 | Figure, 11 | Footnote, 12 | Form, 13 | Handwriting, 14 | InlineMath, 15 | ListItem, 16 | PageFooter, 17 | PageHeader, 18 | Picture, 19 | SectionHeader, 20 | Table, 21 | TableOfContents, 22 | Text, 23 | ComplexRegion, 24 | TableCell, 25 | Reference, 26 | ) 27 | from marker.schema.document import Document 28 | from marker.schema.groups import ( 29 | FigureGroup, 30 | ListGroup, 31 | PageGroup, 32 | PictureGroup, 33 | TableGroup, 34 | ) 35 | from marker.schema.text import Line, Span 36 | from marker.schema.text.char import Char 37 | 38 | BLOCK_REGISTRY: Dict[BlockTypes, str] = {} 39 | 40 | 41 | def register_block_class(block_type: BlockTypes, block_cls: Type[Block]): 42 | BLOCK_REGISTRY[block_type] = f"{block_cls.__module__}.{block_cls.__name__}" 43 | 44 | 45 | def get_block_class(block_type: BlockTypes) -> Type[Block]: 46 | class_path = BLOCK_REGISTRY[block_type] 47 | module_name, class_name = class_path.rsplit(".", 1) 48 | module = import_module(module_name) 49 | return getattr(module, class_name) 50 | 51 | 52 | register_block_class(BlockTypes.Line, Line) 53 | register_block_class(BlockTypes.Span, Span) 54 | register_block_class(BlockTypes.Char, Char) 55 | register_block_class(BlockTypes.FigureGroup, FigureGroup) 56 | register_block_class(BlockTypes.TableGroup, TableGroup) 57 | register_block_class(BlockTypes.ListGroup, ListGroup) 58 | register_block_class(BlockTypes.PictureGroup, PictureGroup) 59 | register_block_class(BlockTypes.Page, PageGroup) 60 | register_block_class(BlockTypes.Caption, Caption) 61 | register_block_class(BlockTypes.Code, Code) 62 | register_block_class(BlockTypes.Figure, Figure) 63 | register_block_class(BlockTypes.Footnote, Footnote) 64 | register_block_class(BlockTypes.Form, Form) 65 | register_block_class(BlockTypes.Equation, Equation) 66 | register_block_class(BlockTypes.Handwriting, Handwriting) 67 | register_block_class(BlockTypes.TextInlineMath, InlineMath) 68 | register_block_class(BlockTypes.ListItem, ListItem) 69 | register_block_class(BlockTypes.PageFooter, PageFooter) 70 | register_block_class(BlockTypes.PageHeader, PageHeader) 71 | register_block_class(BlockTypes.Picture, Picture) 72 | register_block_class(BlockTypes.SectionHeader, SectionHeader) 73 | register_block_class(BlockTypes.Table, Table) 74 | register_block_class(BlockTypes.Text, Text) 75 | register_block_class(BlockTypes.TableOfContents, TableOfContents) 76 | register_block_class(BlockTypes.ComplexRegion, ComplexRegion) 77 | register_block_class(BlockTypes.TableCell, TableCell) 78 | register_block_class(BlockTypes.Reference, Reference) 79 | register_block_class(BlockTypes.Document, Document) 80 | 81 | assert len(BLOCK_REGISTRY) == len(BlockTypes) 82 | assert all( 83 | [ 84 | 
get_block_class(k).model_fields["block_type"].default == k 85 | for k, _ in BLOCK_REGISTRY.items() 86 | ] 87 | ) 88 | ``` -------------------------------------------------------------------------------- /marker/processors/blockquote.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Annotated, Tuple 2 | 3 | from marker.processors import BaseProcessor 4 | from marker.schema import BlockTypes 5 | from marker.schema.document import Document 6 | 7 | 8 | class BlockquoteProcessor(BaseProcessor): 9 | """ 10 | A processor for tagging blockquotes. 11 | """ 12 | block_types: Annotated[ 13 | Tuple[BlockTypes], 14 | "The block types to process.", 15 | ] = (BlockTypes.Text, BlockTypes.TextInlineMath) 16 | min_x_indent: Annotated[ 17 | float, 18 | "The minimum horizontal indentation required to consider a block as part of a blockquote.", 19 | "Expressed as a percentage of the block width.", 20 | ] = 0.1 21 | x_start_tolerance: Annotated[ 22 | float, 23 | "The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.", 24 | "Expressed as a percentage of the block width.", 25 | ] = 0.01 26 | x_end_tolerance: Annotated[ 27 | float, 28 | "The maximum allowable difference between the ending x-coordinates of consecutive blocks to consider them aligned.", 29 | "Expressed as a percentage of the block width.", 30 | ] = 0.01 31 | 32 | def __init__(self, config): 33 | super().__init__(config) 34 | 35 | def __call__(self, document: Document): 36 | for page in document.pages: 37 | for block in page.contained_blocks(document, self.block_types): 38 | if block.structure is None: 39 | continue 40 | 41 | if not len(block.structure) >= 2: 42 | continue 43 | 44 | next_block = page.get_next_block(block) 45 | if next_block is None: 46 | continue 47 | if next_block.block_type not in self.block_types: 48 | continue 49 | if next_block.structure is None: 50 | continue 51 | if next_block.ignore_for_output: 52 | continue 53 | 54 | matching_x_end = abs(next_block.polygon.x_end - block.polygon.x_end) < self.x_end_tolerance * block.polygon.width 55 | matching_x_start = abs(next_block.polygon.x_start - block.polygon.x_start) < self.x_start_tolerance * block.polygon.width 56 | x_indent = next_block.polygon.x_start > block.polygon.x_start + (self.min_x_indent * block.polygon.width) 57 | y_indent = next_block.polygon.y_start > block.polygon.y_end 58 | 59 | if block.blockquote: 60 | next_block.blockquote = (matching_x_end and matching_x_start) or (x_indent and y_indent) 61 | next_block.blockquote_level = block.blockquote_level 62 | if (x_indent and y_indent): 63 | next_block.blockquote_level += 1 64 | elif len(next_block.structure) >= 2 and (x_indent and y_indent): 65 | next_block.blockquote = True 66 | next_block.blockquote_level = 1 ``` -------------------------------------------------------------------------------- /tests/services/test_service_init.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.converters.pdf import PdfConverter 4 | from marker.services.gemini import GoogleGeminiService 5 | from marker.services.ollama import OllamaService 6 | from marker.services.vertex import GoogleVertexService 7 | from marker.services.openai import OpenAIService 8 | from marker.services.azure_openai import AzureOpenAIService 9 | 10 | 11 | @pytest.mark.output_format("markdown") 12 | @pytest.mark.config({"page_range": [0]}) 13 | def 
test_empty_llm(pdf_converter: PdfConverter, temp_doc): 14 | assert pdf_converter.artifact_dict["llm_service"] is None 15 | assert pdf_converter.llm_service is None 16 | 17 | 18 | def test_llm_no_keys(model_dict, config): 19 | with pytest.raises(AssertionError): 20 | PdfConverter(artifact_dict=model_dict, config={"use_llm": True}) 21 | 22 | 23 | @pytest.mark.output_format("markdown") 24 | @pytest.mark.config({"page_range": [0], "use_llm": True, "gemini_api_key": "test"}) 25 | def test_llm_gemini(pdf_converter: PdfConverter, temp_doc): 26 | assert pdf_converter.artifact_dict["llm_service"] is not None 27 | assert isinstance(pdf_converter.llm_service, GoogleGeminiService) 28 | 29 | 30 | @pytest.mark.output_format("markdown") 31 | @pytest.mark.config( 32 | { 33 | "page_range": [0], 34 | "use_llm": True, 35 | "vertex_project_id": "test", 36 | "llm_service": "marker.services.vertex.GoogleVertexService", 37 | } 38 | ) 39 | def test_llm_vertex(pdf_converter: PdfConverter, temp_doc): 40 | assert pdf_converter.artifact_dict["llm_service"] is not None 41 | assert isinstance(pdf_converter.llm_service, GoogleVertexService) 42 | 43 | 44 | @pytest.mark.output_format("markdown") 45 | @pytest.mark.config( 46 | { 47 | "page_range": [0], 48 | "use_llm": True, 49 | "llm_service": "marker.services.ollama.OllamaService", 50 | } 51 | ) 52 | def test_llm_ollama(pdf_converter: PdfConverter, temp_doc): 53 | assert pdf_converter.artifact_dict["llm_service"] is not None 54 | assert isinstance(pdf_converter.llm_service, OllamaService) 55 | 56 | 57 | @pytest.mark.output_format("markdown") 58 | @pytest.mark.config( 59 | { 60 | "page_range": [0], 61 | "use_llm": True, 62 | "llm_service": "marker.services.openai.OpenAIService", 63 | "openai_api_key": "test", 64 | } 65 | ) 66 | def test_llm_openai(pdf_converter: PdfConverter, temp_doc): 67 | assert pdf_converter.artifact_dict["llm_service"] is not None 68 | assert isinstance(pdf_converter.llm_service, OpenAIService) 69 | 70 | 71 | @pytest.mark.output_format("markdown") 72 | @pytest.mark.config( 73 | { 74 | "page_range": [0], 75 | "use_llm": True, 76 | "llm_service": "marker.services.azure_openai.AzureOpenAIService", 77 | "azure_endpoint": "https://example.openai.azure.com", 78 | "azure_api_key": "test", 79 | "deployment_name": "test-model", 80 | "azure_api_version": "1", 81 | } 82 | ) 83 | def test_llm_azure_openai(pdf_converter: PdfConverter, temp_doc): 84 | assert pdf_converter.artifact_dict["llm_service"] is not None 85 | assert isinstance(pdf_converter.llm_service, AzureOpenAIService) 86 | ``` -------------------------------------------------------------------------------- /marker/converters/extraction.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | from typing import Annotated 3 | 4 | from marker.builders.document import DocumentBuilder 5 | from marker.builders.line import LineBuilder 6 | from marker.builders.ocr import OcrBuilder 7 | from marker.builders.structure import StructureBuilder 8 | from marker.converters.pdf import PdfConverter 9 | from marker.extractors.document import DocumentExtractor 10 | from marker.extractors.page import PageExtractor 11 | from marker.providers.registry import provider_from_filepath 12 | 13 | from marker.renderers.extraction import ExtractionRenderer, ExtractionOutput 14 | from marker.renderers.markdown import MarkdownRenderer 15 | 16 | from marker.logger import get_logger 17 | 18 | logger = get_logger() 19 | 20 | 21 | class ExtractionConverter(PdfConverter): 22 | 
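    """
    Runs structured extraction on top of PdfConverter: the document is rendered
    to paginated markdown, PageExtractor gathers notes for each page, and
    DocumentExtractor merges those notes into JSON matching the configured schema.
    """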
pattern: str = r"{\d+\}-{48}\n\n" 23 | existing_markdown: Annotated[ 24 | str, "Markdown that was already converted for extraction." 25 | ] = None 26 | 27 | def build_document(self, filepath: str): 28 | provider_cls = provider_from_filepath(filepath) 29 | layout_builder = self.resolve_dependencies(self.layout_builder_class) 30 | line_builder = self.resolve_dependencies(LineBuilder) 31 | ocr_builder = self.resolve_dependencies(OcrBuilder) 32 | provider = provider_cls(filepath, self.config) 33 | document = DocumentBuilder(self.config)( 34 | provider, layout_builder, line_builder, ocr_builder 35 | ) 36 | structure_builder_cls = self.resolve_dependencies(StructureBuilder) 37 | structure_builder_cls(document) 38 | 39 | for processor in self.processor_list: 40 | processor(document) 41 | 42 | return document, provider 43 | 44 | def __call__(self, filepath: str) -> ExtractionOutput: 45 | self.config["paginate_output"] = True # Ensure we can split the output properly 46 | self.config["output_format"] = ( 47 | "markdown" # Output must be markdown for extraction 48 | ) 49 | markdown = self.existing_markdown 50 | 51 | if not markdown: 52 | document, provider = self.build_document(filepath) 53 | self.page_count = len(document.pages) 54 | renderer = self.resolve_dependencies(MarkdownRenderer) 55 | output = renderer(document) 56 | markdown = output.markdown 57 | 58 | output_pages = re.split(self.pattern, markdown)[1:] # Split output into pages 59 | 60 | # This needs an LLM service for extraction, this sets it in the extractor 61 | if self.artifact_dict.get("llm_service") is None: 62 | self.artifact_dict["llm_service"] = self.resolve_dependencies( 63 | self.default_llm_service 64 | ) 65 | 66 | page_extractor = self.resolve_dependencies(PageExtractor) 67 | document_extractor = self.resolve_dependencies(DocumentExtractor) 68 | renderer = self.resolve_dependencies(ExtractionRenderer) 69 | 70 | # Inference in parallel 71 | notes = page_extractor(output_pages) 72 | document_output = document_extractor(notes) 73 | 74 | merged = renderer(document_output, markdown) 75 | return merged 76 | ``` -------------------------------------------------------------------------------- /marker/providers/epub.py: -------------------------------------------------------------------------------- ```python 1 | import base64 2 | import os 3 | import tempfile 4 | 5 | from bs4 import BeautifulSoup 6 | 7 | from marker.providers.pdf import PdfProvider 8 | 9 | css = ''' 10 | @page { 11 | size: A4; 12 | margin: 2cm; 13 | } 14 | 15 | img { 16 | max-width: 100%; 17 | max-height: 25cm; 18 | object-fit: contain; 19 | margin: 12pt auto; 20 | } 21 | 22 | div, p { 23 | max-width: 100%; 24 | word-break: break-word; 25 | font-size: 10pt; 26 | } 27 | 28 | table { 29 | width: 100%; 30 | border-collapse: collapse; 31 | break-inside: auto; 32 | font-size: 10pt; 33 | } 34 | 35 | tr { 36 | break-inside: avoid; 37 | page-break-inside: avoid; 38 | } 39 | 40 | td { 41 | border: 0.75pt solid #000; 42 | padding: 6pt; 43 | } 44 | ''' 45 | 46 | 47 | class EpubProvider(PdfProvider): 48 | def __init__(self, filepath: str, config=None): 49 | temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=f".pdf") 50 | self.temp_pdf_path = temp_pdf.name 51 | temp_pdf.close() 52 | 53 | # Convert Epub to PDF 54 | try: 55 | self.convert_epub_to_pdf(filepath) 56 | except Exception as e: 57 | raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}") 58 | 59 | # Initialize the PDF provider with the temp pdf path 60 | super().__init__(self.temp_pdf_path, config) 61 | 
62 |     def __del__(self):
63 |         if os.path.exists(self.temp_pdf_path):
64 |             os.remove(self.temp_pdf_path)
65 | 
66 |     def convert_epub_to_pdf(self, filepath):
67 |         from weasyprint import CSS, HTML
68 |         from ebooklib import epub
69 |         import ebooklib
70 | 
71 |         ebook = epub.read_epub(filepath)
72 | 
73 |         styles = []
74 |         html_content = ""
75 |         img_tags = {}
76 | 
77 |         for item in ebook.get_items():
78 |             if item.get_type() == ebooklib.ITEM_IMAGE:
79 |                 img_data = base64.b64encode(item.get_content()).decode("utf-8")
80 |                 img_tags[item.file_name] = f'data:{item.media_type};base64,{img_data}'
81 |             elif item.get_type() == ebooklib.ITEM_STYLE:
82 |                 styles.append(item.get_content().decode('utf-8'))
83 | 
84 |         for item in ebook.get_items():
85 |             if item.get_type() == ebooklib.ITEM_DOCUMENT:
86 |                 html_content += item.get_content().decode("utf-8")
87 | 
88 |         soup = BeautifulSoup(html_content, 'html.parser')
89 |         for img in soup.find_all('img'):
90 |             src = img.get('src')
91 |             if src:
92 |                 normalized_src = src.replace('../', '')
93 |                 if normalized_src in img_tags:
94 |                     img['src'] = img_tags[normalized_src]
95 | 
96 |         for image in soup.find_all('image'):
97 |             src = image.get('xlink:href')
98 |             if src:
99 |                 normalized_src = src.replace('../', '')
100 |                 if normalized_src in img_tags:
101 |                     image['xlink:href'] = img_tags[normalized_src]
102 | 
103 |         html_content = str(soup)
104 |         full_style = ''.join([css])  # the EPUB's own stylesheets (collected in `styles`) are intentionally not applied
105 | 
106 |         # Convert the assembled HTML to a PDF with WeasyPrint
107 |         HTML(string=html_content, base_url=filepath).write_pdf(
108 |             self.temp_pdf_path,
109 |             stylesheets=[CSS(string=full_style), self.get_font_css()]
110 |         )
111 | 
```
--------------------------------------------------------------------------------
/tests/processors/test_table_processor.py:
--------------------------------------------------------------------------------
```python
1 | from typing import List
2 | 
3 | import pytest
4 | 
5 | from marker.renderers.markdown import MarkdownRenderer
6 | from marker.schema import BlockTypes
7 | from marker.processors.table import TableProcessor
8 | from marker.schema.blocks import TableCell
9 | 
10 | 
11 | @pytest.mark.config({"page_range": [5]})
12 | def test_table_processor(
13 |     pdf_document, recognition_model, table_rec_model, detection_model
14 | ):
15 |     processor = TableProcessor(recognition_model, table_rec_model, detection_model)
16 |     processor(pdf_document)
17 | 
18 |     for block in pdf_document.pages[0].children:
19 |         if block.block_type == BlockTypes.Table:
20 |             children = block.contained_blocks(pdf_document, (BlockTypes.TableCell,))
21 |             assert children
22 |             assert len(children) > 0
23 |             assert isinstance(children[0], TableCell)
24 | 
25 |     assert len(pdf_document.contained_blocks((BlockTypes.Table,))) == 2
26 | 
27 |     renderer = MarkdownRenderer()
28 |     table_output = renderer(pdf_document)
29 |     assert "Schedule" in table_output.markdown
30 | 
31 | 
32 | @pytest.mark.filename("table_ex.pdf")
33 | @pytest.mark.config({"page_range": [0], "force_ocr": True})
34 | def test_avoid_double_ocr(
35 |     pdf_document, recognition_model, table_rec_model, detection_model
36 | ):
37 |     tables = pdf_document.contained_blocks((BlockTypes.Table,))
38 |     lines = tables[0].contained_blocks(pdf_document, (BlockTypes.Line,))
39 |     assert len(lines) == 0
40 | 
41 |     processor = TableProcessor(
42 |         recognition_model, table_rec_model, detection_model, config={"force_ocr": True}
43 |     )
44 |     processor(pdf_document)
45 | 
46 |     renderer = MarkdownRenderer()
47 |     table_output = renderer(pdf_document)
48 |     assert "Participants" in table_output.markdown
49 | 
50 | 51 | @pytest.mark.filename("multicol-blocks.pdf") 52 | @pytest.mark.config({"page_range": [3]}) 53 | def test_overlap_blocks( 54 | pdf_document, detection_model, recognition_model, table_rec_model 55 | ): 56 | page = pdf_document.pages[0] 57 | assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text( 58 | pdf_document 59 | ) 60 | 61 | processor = TableProcessor(recognition_model, table_rec_model, detection_model) 62 | processor(pdf_document) 63 | 64 | assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text( 65 | pdf_document 66 | ) 67 | 68 | 69 | @pytest.mark.filename("pres.pdf") 70 | @pytest.mark.config({"page_range": [4]}) 71 | def test_ocr_table(pdf_document, recognition_model, table_rec_model, detection_model): 72 | processor = TableProcessor(recognition_model, table_rec_model, detection_model) 73 | processor(pdf_document) 74 | 75 | renderer = MarkdownRenderer() 76 | table_output = renderer(pdf_document) 77 | assert "1.2E-38" in table_output.markdown 78 | 79 | 80 | @pytest.mark.config({"page_range": [11]}) 81 | def test_split_rows(pdf_document, recognition_model, table_rec_model, detection_model): 82 | processor = TableProcessor(recognition_model, table_rec_model, detection_model) 83 | processor(pdf_document) 84 | 85 | table = pdf_document.contained_blocks((BlockTypes.Table,))[-1] 86 | cells: List[TableCell] = table.contained_blocks( 87 | pdf_document, (BlockTypes.TableCell,) 88 | ) 89 | unique_rows = len(set([cell.row_id for cell in cells])) 90 | assert unique_rows == 6 91 | ``` -------------------------------------------------------------------------------- /marker/renderers/json.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Annotated, Dict, List, Tuple 2 | 3 | from pydantic import BaseModel 4 | 5 | from marker.renderers import BaseRenderer 6 | from marker.schema import BlockTypes 7 | from marker.schema.blocks import Block, BlockOutput 8 | from marker.schema.document import Document 9 | from marker.schema.registry import get_block_class 10 | 11 | 12 | class JSONBlockOutput(BaseModel): 13 | id: str 14 | block_type: str 15 | html: str 16 | polygon: List[List[float]] 17 | bbox: List[float] 18 | children: List["JSONBlockOutput"] | None = None 19 | section_hierarchy: Dict[int, str] | None = None 20 | images: dict | None = None 21 | 22 | 23 | class JSONOutput(BaseModel): 24 | children: List[JSONBlockOutput] 25 | block_type: str = str(BlockTypes.Document) 26 | metadata: dict 27 | 28 | 29 | def reformat_section_hierarchy(section_hierarchy): 30 | new_section_hierarchy = {} 31 | for key, value in section_hierarchy.items(): 32 | new_section_hierarchy[key] = str(value) 33 | return new_section_hierarchy 34 | 35 | 36 | class JSONRenderer(BaseRenderer): 37 | """ 38 | A renderer for JSON output. 
39 | """ 40 | 41 | image_blocks: Annotated[ 42 | Tuple[BlockTypes], 43 | "The list of block types to consider as images.", 44 | ] = (BlockTypes.Picture, BlockTypes.Figure) 45 | page_blocks: Annotated[ 46 | Tuple[BlockTypes], 47 | "The list of block types to consider as pages.", 48 | ] = (BlockTypes.Page,) 49 | 50 | def extract_json(self, document: Document, block_output: BlockOutput): 51 | cls = get_block_class(block_output.id.block_type) 52 | if cls.__base__ == Block: 53 | html, images = self.extract_block_html(document, block_output) 54 | return JSONBlockOutput( 55 | html=html, 56 | polygon=block_output.polygon.polygon, 57 | bbox=block_output.polygon.bbox, 58 | id=str(block_output.id), 59 | block_type=str(block_output.id.block_type), 60 | images=images, 61 | section_hierarchy=reformat_section_hierarchy( 62 | block_output.section_hierarchy 63 | ), 64 | ) 65 | else: 66 | children = [] 67 | for child in block_output.children: 68 | child_output = self.extract_json(document, child) 69 | children.append(child_output) 70 | 71 | return JSONBlockOutput( 72 | html=block_output.html, 73 | polygon=block_output.polygon.polygon, 74 | bbox=block_output.polygon.bbox, 75 | id=str(block_output.id), 76 | block_type=str(block_output.id.block_type), 77 | children=children, 78 | section_hierarchy=reformat_section_hierarchy( 79 | block_output.section_hierarchy 80 | ), 81 | ) 82 | 83 | def __call__(self, document: Document) -> JSONOutput: 84 | document_output = document.render(self.block_config) 85 | json_output = [] 86 | for page_output in document_output.children: 87 | json_output.append(self.extract_json(document, page_output)) 88 | return JSONOutput( 89 | children=json_output, 90 | metadata=self.generate_document_metadata(document, document_output), 91 | ) 92 | ``` -------------------------------------------------------------------------------- /marker/processors/llm/llm_image_description.py: -------------------------------------------------------------------------------- ```python 1 | from pydantic import BaseModel 2 | 3 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.document import Document 7 | 8 | from typing import Annotated, List 9 | 10 | 11 | class LLMImageDescriptionProcessor(BaseLLMSimpleBlockProcessor): 12 | block_types = ( 13 | BlockTypes.Picture, 14 | BlockTypes.Figure, 15 | ) 16 | extract_images: Annotated[bool, "Extract images from the document."] = True 17 | image_description_prompt: Annotated[ 18 | str, 19 | "The prompt to use for generating image descriptions.", 20 | "Default is a string containing the Gemini prompt.", 21 | ] = """You are a document analysis expert who specializes in creating text descriptions for images. 22 | You will receive an image of a picture or figure. Your job will be to create a short description of the image. 23 | **Instructions:** 24 | 1. Carefully examine the provided image. 25 | 2. Analyze any text that was extracted from within the image. 26 | 3. Output a faithful description of the image. Make sure there is enough specific detail to accurately reconstruct the image. If the image is a figure or contains numeric data, include the numeric data in the output. 27 | **Example:** 28 | Input: 29 | ```text 30 | "Fruit Preference Survey" 31 | 20, 15, 10 32 | Apples, Bananas, Oranges 33 | ``` 34 | Output: 35 | In this figure, a bar chart titled "Fruit Preference Survey" is showing the number of people who prefer different types of fruits. 
The x-axis shows the types of fruits, and the y-axis shows the number of people. The bar chart shows that most people prefer apples, followed by bananas and oranges. 20 people prefer apples, 15 people prefer bananas, and 10 people prefer oranges. 36 | **Input:** 37 | ```text 38 | {raw_text} 39 | ``` 40 | """ 41 | 42 | def inference_blocks(self, document: Document) -> List[BlockData]: 43 | blocks = super().inference_blocks(document) 44 | if self.extract_images: 45 | return [] 46 | return blocks 47 | 48 | def block_prompts(self, document: Document) -> List[PromptData]: 49 | prompt_data = [] 50 | for block_data in self.inference_blocks(document): 51 | block = block_data["block"] 52 | prompt = self.image_description_prompt.replace( 53 | "{raw_text}", block.raw_text(document) 54 | ) 55 | image = self.extract_image(document, block) 56 | 57 | prompt_data.append( 58 | { 59 | "prompt": prompt, 60 | "image": image, 61 | "block": block, 62 | "schema": ImageSchema, 63 | "page": block_data["page"], 64 | } 65 | ) 66 | 67 | return prompt_data 68 | 69 | def rewrite_block( 70 | self, response: dict, prompt_data: PromptData, document: Document 71 | ): 72 | block = prompt_data["block"] 73 | 74 | if not response or "image_description" not in response: 75 | block.update_metadata(llm_error_count=1) 76 | return 77 | 78 | image_description = response["image_description"] 79 | if len(image_description) < 10: 80 | block.update_metadata(llm_error_count=1) 81 | return 82 | 83 | block.description = image_description 84 | 85 | 86 | class ImageSchema(BaseModel): 87 | image_description: str 88 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/display/table.py: -------------------------------------------------------------------------------- ```python 1 | from pathlib import Path 2 | from typing import Dict, List 3 | 4 | import tabulate 5 | 6 | from benchmarks.overall.schema import FullResult 7 | 8 | def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str): 9 | table = tabulate.tabulate(rows, headers=headers, tablefmt="github") 10 | with open(out_path / filename, "w", encoding="utf-8") as f: 11 | f.write(f"# {title}\n") 12 | f.write(table) 13 | print(title) 14 | print(table) 15 | 16 | 17 | def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"): 18 | document_types = list(result["averages_by_type"][default_method][default_score_type].keys()) 19 | headers = ["Document Type"] 20 | for method in methods: 21 | for score_type in score_types: 22 | headers.append(f"{method} {score_type}") 23 | 24 | document_rows = [[k] for k in document_types] 25 | for i, doc_type in enumerate(document_types): 26 | for method in methods: 27 | for score_type in score_types: 28 | avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type])) 29 | document_rows[i].append(avg_score) 30 | 31 | write_table("Document Types", document_rows, headers, out_path, "document_types.md") 32 | 33 | headers = ["Block Type"] 34 | block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks 35 | block_score_types = list(result["averages_by_block_type"][default_method].keys()) 36 | for method in methods: 37 | for score_type in block_score_types: 38 | headers.append(f"{method} {score_type}") 39 | 40 | block_rows = [[k] for k in block_types] 41 | for 
i, block_type in enumerate(block_types): 42 | for method in methods: 43 | for score_type in block_score_types: 44 | avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type])) 45 | block_rows[i].append(avg_score) 46 | 47 | write_table("Block types", block_rows, headers, out_path, "block_types.md") 48 | 49 | headers = ["Method", "Avg Time"] + score_types 50 | inference_rows = [[k] for k in methods] 51 | all_raw_scores = [result["scores"][i] for i in result["scores"]] 52 | for i, method in enumerate(methods): 53 | avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method])) 54 | inference_rows[i].append(avg_time) 55 | for score_type in score_types: 56 | scores_lst = [] 57 | for ar in all_raw_scores: 58 | try: 59 | # Sometimes a few llm scores are missing 60 | scores_lst.append(ar[method][score_type]["score"]) 61 | except KeyError: 62 | continue 63 | avg_score = sum(scores_lst) / max(1, len(scores_lst)) 64 | inference_rows[i].append(avg_score) 65 | 66 | write_table("Overall Results", inference_rows, headers, out_path, "overall.md") 67 | 68 | print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.") ``` -------------------------------------------------------------------------------- /marker/renderers/chunk.py: -------------------------------------------------------------------------------- ```python 1 | import html 2 | from typing import List, Dict 3 | 4 | from bs4 import BeautifulSoup 5 | from pydantic import BaseModel 6 | 7 | from marker.renderers.json import JSONRenderer, JSONBlockOutput 8 | from marker.schema.document import Document 9 | 10 | 11 | class FlatBlockOutput(BaseModel): 12 | id: str 13 | block_type: str 14 | html: str 15 | page: int 16 | polygon: List[List[float]] 17 | bbox: List[float] 18 | section_hierarchy: Dict[int, str] | None = None 19 | images: dict | None = None 20 | 21 | 22 | class ChunkOutput(BaseModel): 23 | blocks: List[FlatBlockOutput] 24 | page_info: Dict[int, dict] 25 | metadata: dict 26 | 27 | def collect_images(block: JSONBlockOutput) -> dict[str, str]: 28 | if not getattr(block, "children", None): 29 | return block.images or {} 30 | else: 31 | images = block.images or {} 32 | for child_block in block.children: 33 | images.update(collect_images(child_block)) 34 | return images 35 | 36 | def assemble_html_with_images(block: JSONBlockOutput, image_blocks: set[str]) -> str: 37 | if not getattr(block, "children", None): 38 | if block.block_type in image_blocks: 39 | return f"<p>{block.html}<img src='{block.id}'></p>" 40 | else: 41 | return block.html 42 | 43 | child_html = [assemble_html_with_images(child, image_blocks) for child in block.children] 44 | child_ids = [child.id for child in block.children] 45 | 46 | soup = BeautifulSoup(block.html, "html.parser") 47 | content_refs = soup.find_all("content-ref") 48 | for ref in content_refs: 49 | src_id = ref.attrs["src"] 50 | if src_id in child_ids: 51 | ref.replace_with(child_html[child_ids.index(src_id)]) 52 | 53 | return html.unescape(str(soup)) 54 | 55 | def json_to_chunks( 56 | block: JSONBlockOutput, image_blocks: set[str], page_id: int=0) -> FlatBlockOutput | List[FlatBlockOutput]: 57 | if block.block_type == "Page": 58 | children = block.children 59 | page_id = int(block.id.split("/")[-1]) 60 | return [json_to_chunks(child, image_blocks, page_id=page_id) for child in children] 61 | 
else: 62 | return FlatBlockOutput( 63 | id=block.id, 64 | block_type=block.block_type, 65 | html=assemble_html_with_images(block, image_blocks), 66 | page=page_id, 67 | polygon=block.polygon, 68 | bbox=block.bbox, 69 | section_hierarchy=block.section_hierarchy, 70 | images=collect_images(block), 71 | ) 72 | 73 | 74 | class ChunkRenderer(JSONRenderer): 75 | 76 | def __call__(self, document: Document) -> ChunkOutput: 77 | document_output = document.render(self.block_config) 78 | json_output = [] 79 | for page_output in document_output.children: 80 | json_output.append(self.extract_json(document, page_output)) 81 | 82 | # This will get the top-level blocks from every page 83 | chunk_output = [] 84 | for item in json_output: 85 | chunks = json_to_chunks(item, set([str(block) for block in self.image_blocks])) 86 | chunk_output.extend(chunks) 87 | 88 | page_info = { 89 | page.page_id: {"bbox": page.polygon.bbox, "polygon": page.polygon.polygon} 90 | for page in document.pages 91 | } 92 | 93 | return ChunkOutput( 94 | blocks=chunk_output, 95 | page_info=page_info, 96 | metadata=self.generate_document_metadata(document, document_output), 97 | ) 98 | ``` -------------------------------------------------------------------------------- /marker/schema/document.py: -------------------------------------------------------------------------------- ```python 1 | from __future__ import annotations 2 | 3 | from typing import List, Sequence, Optional 4 | 5 | from pydantic import BaseModel 6 | 7 | from marker.schema import BlockTypes 8 | from marker.schema.blocks import Block, BlockId, BlockOutput 9 | from marker.schema.groups.page import PageGroup 10 | 11 | 12 | class DocumentOutput(BaseModel): 13 | children: List[BlockOutput] 14 | html: str 15 | block_type: BlockTypes = BlockTypes.Document 16 | 17 | 18 | class TocItem(BaseModel): 19 | title: str 20 | heading_level: int 21 | page_id: int 22 | polygon: List[List[float]] 23 | 24 | 25 | class Document(BaseModel): 26 | filepath: str 27 | pages: List[PageGroup] 28 | block_type: BlockTypes = BlockTypes.Document 29 | table_of_contents: List[TocItem] | None = None 30 | debug_data_path: str | None = None # Path that debug data was saved to 31 | 32 | def get_block(self, block_id: BlockId): 33 | page = self.get_page(block_id.page_id) 34 | block = page.get_block(block_id) 35 | if block: 36 | return block 37 | return None 38 | 39 | def get_page(self, page_id): 40 | for page in self.pages: 41 | if page.page_id == page_id: 42 | return page 43 | return None 44 | 45 | def get_next_block( 46 | self, block: Block, ignored_block_types: List[BlockTypes] = None 47 | ): 48 | if ignored_block_types is None: 49 | ignored_block_types = [] 50 | next_block = None 51 | 52 | # Try to find the next block in the current page 53 | page = self.get_page(block.page_id) 54 | next_block = page.get_next_block(block, ignored_block_types) 55 | if next_block: 56 | return next_block 57 | 58 | # If no block found, search subsequent pages 59 | for page in self.pages[self.pages.index(page) + 1 :]: 60 | next_block = page.get_next_block(None, ignored_block_types) 61 | if next_block: 62 | return next_block 63 | return None 64 | 65 | def get_next_page(self, page: PageGroup): 66 | page_idx = self.pages.index(page) 67 | if page_idx + 1 < len(self.pages): 68 | return self.pages[page_idx + 1] 69 | return None 70 | 71 | def get_prev_block(self, block: Block): 72 | page = self.get_page(block.page_id) 73 | prev_block = page.get_prev_block(block) 74 | if prev_block: 75 | return prev_block 76 | prev_page = 
self.get_prev_page(page) 77 | if not prev_page: 78 | return None 79 | return prev_page.get_block(prev_page.structure[-1]) 80 | 81 | def get_prev_page(self, page: PageGroup): 82 | page_idx = self.pages.index(page) 83 | if page_idx > 0: 84 | return self.pages[page_idx - 1] 85 | return None 86 | 87 | def assemble_html( 88 | self, child_blocks: List[Block], block_config: Optional[dict] = None 89 | ): 90 | template = "" 91 | for c in child_blocks: 92 | template += f"<content-ref src='{c.id}'></content-ref>" 93 | return template 94 | 95 | def render(self, block_config: Optional[dict] = None): 96 | child_content = [] 97 | section_hierarchy = None 98 | for page in self.pages: 99 | rendered = page.render(self, None, section_hierarchy, block_config) 100 | section_hierarchy = rendered.section_hierarchy.copy() 101 | child_content.append(rendered) 102 | 103 | return DocumentOutput( 104 | children=child_content, 105 | html=self.assemble_html(child_content, block_config), 106 | ) 107 | 108 | def contained_blocks(self, block_types: Sequence[BlockTypes] = None) -> List[Block]: 109 | blocks = [] 110 | for page in self.pages: 111 | blocks += page.contained_blocks(self, block_types) 112 | return blocks 113 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/scorers/heuristic.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | from rapidfuzz import fuzz 4 | 5 | from benchmarks.overall.scorers.clean import MarkdownCleaner 6 | from benchmarks.overall.scorers.schema import BlockScores 7 | from benchmarks.overall.scorers import BaseScorer 8 | 9 | 10 | class HeuristicScorer(BaseScorer): 11 | def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores: 12 | if not method_markdown: 13 | return { 14 | "score": 0, 15 | "specific_scores": { 16 | "order": 0, 17 | "by_block": [0] * len(gt_markdown) 18 | } 19 | } 20 | 21 | # Standardize inputs 22 | gt_markdown = [self.clean_input(block) for block in gt_markdown] 23 | method_markdown = self.clean_input(method_markdown) 24 | 25 | alignments = self.find_fuzzy_alignments(method_markdown, gt_markdown) 26 | scores = [alignment["score"] for alignment in alignments] 27 | 28 | # Find order score 29 | orders = [alignment["start"] for alignment in alignments] 30 | correct_order = list(range(len(gt_markdown))) 31 | actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x]) 32 | order_score = self.kendall_tau(correct_order, actual_order) 33 | 34 | # Weight score by sequence length 35 | gt_weights = [len(g) for g in gt_markdown] 36 | weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)] 37 | 38 | # Weight the score by sequence length 39 | overall_score = sum(weighted_scores) / max(1, sum(gt_weights)) 40 | overall_score = overall_score * 0.8 + order_score * 0.2 41 | return { 42 | "score": overall_score, 43 | "specific_scores": { 44 | "order": order_score, 45 | "by_block": scores 46 | }, 47 | } 48 | 49 | @staticmethod 50 | def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float: 51 | n = len(correct_order) 52 | concordant = 0 53 | discordant = 0 54 | 55 | if n <= 1: 56 | return 100 57 | 58 | for i in range(n): 59 | for j in range(i + 1, n): 60 | correct_sign = correct_order[i] - correct_order[j] 61 | actual_sign = actual_order[i] - actual_order[j] 62 | 63 | if (correct_sign > 0 and actual_sign > 0) or (correct_sign < 0 and actual_sign < 0): 64 | concordant += 1 65 | elif 
(correct_sign < 0 and actual_sign > 0) or (correct_sign > 0 and actual_sign < 0): 66 | discordant += 1 67 | 68 | total_pairs = (n * (n - 1)) // 2 69 | tau = (concordant - discordant) / total_pairs 70 | tau = (tau + 1) / 2 # 0-1 scale 71 | return tau * 100 # 0-100 scale 72 | 73 | @staticmethod 74 | def find_fuzzy_alignments( 75 | main_string: str, 76 | substrings: List[str], 77 | threshold: int = 70 78 | ) -> List[dict]: 79 | alignments = [] 80 | 81 | for idx, substr in enumerate(substrings): 82 | result = fuzz.partial_ratio_alignment(substr, main_string, score_cutoff=threshold) 83 | 84 | score = 0 85 | dest_start = 0 86 | dest_end = 0 87 | if result: 88 | score = result.score 89 | dest_start = result.dest_start 90 | dest_end = result.dest_end 91 | 92 | alignments.append({ 93 | "string": substr, 94 | "start": dest_start, 95 | "end": dest_end, 96 | "score": score, 97 | "idx": idx 98 | }) 99 | return alignments 100 | 101 | 102 | @staticmethod 103 | def clean_input(md: str): 104 | cleaner = MarkdownCleaner() 105 | return cleaner(md) ``` -------------------------------------------------------------------------------- /marker/output.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import os 3 | 4 | from bs4 import BeautifulSoup, Tag 5 | from pydantic import BaseModel 6 | from PIL import Image 7 | 8 | from marker.renderers.extraction import ExtractionOutput 9 | from marker.renderers.html import HTMLOutput 10 | from marker.renderers.json import JSONOutput, JSONBlockOutput 11 | from marker.renderers.markdown import MarkdownOutput 12 | from marker.renderers.ocr_json import OCRJSONOutput 13 | from marker.schema.blocks import BlockOutput 14 | from marker.settings import settings 15 | 16 | 17 | def unwrap_outer_tag(html: str): 18 | soup = BeautifulSoup(html, "html.parser") 19 | contents = list(soup.contents) 20 | if len(contents) == 1 and isinstance(contents[0], Tag) and contents[0].name == "p": 21 | # Unwrap the p tag 22 | soup.p.unwrap() 23 | 24 | return str(soup) 25 | 26 | 27 | def json_to_html(block: JSONBlockOutput | BlockOutput): 28 | # Utility function to take in json block output and give html for the block. 
29 |     if not getattr(block, "children", None):
30 |         return block.html
31 |     else:
32 |         child_html = [json_to_html(child) for child in block.children]
33 |         child_ids = [child.id for child in block.children]
34 | 
35 |         soup = BeautifulSoup(block.html, "html.parser")
36 |         content_refs = soup.find_all("content-ref")
37 |         for ref in content_refs:
38 |             src_id = ref.attrs["src"]
39 |             if src_id in child_ids:
40 |                 child_soup = BeautifulSoup(
41 |                     child_html[child_ids.index(src_id)], "html.parser"
42 |                 )
43 |                 ref.replace_with(child_soup)
44 |         return str(soup)
45 | 
46 | 
47 | def output_exists(output_dir: str, fname_base: str):
48 |     exts = ["md", "html", "json"]
49 |     for ext in exts:
50 |         if os.path.exists(os.path.join(output_dir, f"{fname_base}.{ext}")):
51 |             return True
52 |     return False
53 | 
54 | 
55 | def text_from_rendered(rendered: BaseModel):
56 |     from marker.renderers.chunk import ChunkOutput  # Has an import from this file
57 | 
58 |     if isinstance(rendered, MarkdownOutput):
59 |         return rendered.markdown, "md", rendered.images
60 |     elif isinstance(rendered, HTMLOutput):
61 |         return rendered.html, "html", rendered.images
62 |     elif isinstance(rendered, JSONOutput):
63 |         return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
64 |     elif isinstance(rendered, ChunkOutput):
65 |         return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
66 |     elif isinstance(rendered, OCRJSONOutput):
67 |         return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
68 |     elif isinstance(rendered, ExtractionOutput):
69 |         return rendered.document_json, "json", {}
70 |     else:
71 |         raise ValueError("Invalid output type")
72 | 
73 | 
74 | def convert_if_not_rgb(image: Image.Image) -> Image.Image:
75 |     if image.mode != "RGB":
76 |         image = image.convert("RGB")
77 |     return image
78 | 
79 | 
80 | def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
81 |     text, ext, images = text_from_rendered(rendered)
82 |     text = text.encode(settings.OUTPUT_ENCODING, errors="replace").decode(
83 |         settings.OUTPUT_ENCODING
84 |     )
85 | 
86 |     with open(
87 |         os.path.join(output_dir, f"{fname_base}.{ext}"),
88 |         "w+",
89 |         encoding=settings.OUTPUT_ENCODING,
90 |     ) as f:
91 |         f.write(text)
92 |     with open(
93 |         os.path.join(output_dir, f"{fname_base}_meta.json"),
94 |         "w+",
95 |         encoding=settings.OUTPUT_ENCODING,
96 |     ) as f:
97 |         f.write(json.dumps(rendered.metadata, indent=2))
98 | 
99 |     for img_name, img in images.items():
100 |         img = convert_if_not_rgb(img)  # RGBA images can't save as JPG
101 |         img.save(os.path.join(output_dir, img_name), settings.OUTPUT_IMAGE_FORMAT)
102 | 
```
--------------------------------------------------------------------------------
/marker/processors/block_relabel.py:
--------------------------------------------------------------------------------
```python
1 | from copy import deepcopy
2 | from typing import Annotated
3 | 
4 | from marker.processors import BaseProcessor
5 | from marker.schema import BlockTypes
6 | from marker.schema.blocks import BlockId
7 | from marker.schema.document import Document
8 | from marker.schema.registry import get_block_class
9 | 
10 | from marker.logger import get_logger
11 | logger = get_logger()
12 | 
13 | class BlockRelabelProcessor(BaseProcessor):
14 |     """
15 |     A processor to heuristically relabel blocks based on a confidence threshold.
16 | 
17 |     Each rule in the relabel string maps an original block label to a new one;
18 |     a block is relabeled only when the confidence in its original label does not exceed the rule's threshold.
19 |     """
20 | 
21 |     block_relabel_str: Annotated[
22 |         str,
23 |         "Comma-separated relabeling rules in the format '<original_label>:<new_label>:<confidence_threshold>'.",
24 |         "Each rule defines how blocks of a certain type should be relabeled when the confidence in the original label does not exceed the threshold.",
25 |         "Example: 'Table:Picture:0.85,Form:Picture:0.9'"
26 |     ] = ""
27 | 
28 |     def __init__(self, config=None):
29 |         super().__init__(config)
30 |         self.block_relabel_map = {}
31 | 
32 |         if not self.block_relabel_str:
33 |             return
34 | 
35 |         for i, block_config_str in enumerate(self.block_relabel_str.split(',')):
36 |             block_config_str = block_config_str.strip()
37 |             if not block_config_str:
38 |                 continue  # Skip empty segments
39 | 
40 |             try:
41 |                 parts = block_config_str.split(':')
42 |                 if len(parts) != 3:
43 |                     raise ValueError(f"Expected 3 parts, got {len(parts)}")
44 | 
45 |                 block_label, block_relabel, confidence_str = parts
46 |                 confidence_thresh = float(confidence_str)
47 | 
48 |                 block_type = BlockTypes[block_label]
49 |                 relabel_block_type = BlockTypes[block_relabel]
50 | 
51 |                 self.block_relabel_map[block_type] = (
52 |                     confidence_thresh,
53 |                     relabel_block_type
54 |                 )
55 |             except Exception as e:
56 |                 logger.warning(f"Failed to parse relabel rule '{block_config_str}' at index {i}: {e}. Expected format is <original_label>:<new_label>:<confidence_threshold>")
57 | 
58 |     def __call__(self, document: Document):
59 |         if len(self.block_relabel_map) == 0:
60 |             return
61 | 
62 |         for page in document.pages:
63 |             for block in page.structure_blocks(document):
64 |                 if block.block_type not in self.block_relabel_map:
65 |                     continue
66 | 
67 |                 block_id = BlockId(page_id=page.page_id, block_id=block.block_id, block_type=block.block_type)
68 |                 confidence_thresh, relabel_block_type = self.block_relabel_map[block.block_type]
69 |                 confidence = block.top_k.get(block.block_type, 0)  # default to 0 so a missing score is never compared against the threshold as None
70 |                 if confidence > confidence_thresh:
71 |                     logger.debug(f"Skipping relabel for {block_id}; Confidence: {confidence} > Confidence Threshold {confidence_thresh} for re-labelling")
72 |                     continue
73 | 
74 |                 new_block_cls = get_block_class(relabel_block_type)
75 |                 new_block = new_block_cls(
76 |                     polygon=deepcopy(block.polygon),
77 |                     page_id=block.page_id,
78 |                     structure=deepcopy(block.structure),
79 |                     text_extraction_method=block.text_extraction_method,
80 |                     source="heuristics",
81 |                     top_k=block.top_k,
82 |                     metadata=block.metadata
83 |                 )
84 |                 page.replace_block(block, new_block)
85 |                 logger.debug(f"Relabelled {block_id} to {relabel_block_type}")
```
--------------------------------------------------------------------------------
/marker/providers/spreadsheet.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 | import tempfile
3 | 
4 | from marker.providers.pdf import PdfProvider
5 | 
6 | css = '''
7 | @page {
8 |     size: A4 landscape;
9 |     margin: 1.5cm;
10 | }
11 | 
12 | table {
13 |     width: 100%;
14 |     border-collapse: collapse;
15 |     break-inside: auto;
16 |     font-size: 10pt;
17 | }
18 | 
19 | tr {
20 |     break-inside: avoid;
21 |     page-break-inside: avoid;
22 | }
23 | 
24 | td {
25 |     border: 0.75pt solid #000;
26 |     padding: 6pt;
27 | }
28 | '''
29 | 
30 | 
31 | class SpreadSheetProvider(PdfProvider):
32 |     def __init__(self, filepath: str, config=None):
33 |         temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
34 |         self.temp_pdf_path = temp_pdf.name
35 |         temp_pdf.close()
36 | 
37 |         # Convert XLSX to PDF
38 |         try:
39 |             self.convert_xlsx_to_pdf(filepath)
40 |         except Exception as e:
41 |             raise 
RuntimeError(f"Failed to convert {filepath} to PDF: {e}") 42 | 43 | # Initialize the PDF provider with the temp pdf path 44 | super().__init__(self.temp_pdf_path, config) 45 | 46 | def __del__(self): 47 | if os.path.exists(self.temp_pdf_path): 48 | os.remove(self.temp_pdf_path) 49 | 50 | def convert_xlsx_to_pdf(self, filepath: str): 51 | from weasyprint import CSS, HTML 52 | from openpyxl import load_workbook 53 | 54 | html = "" 55 | workbook = load_workbook(filepath) 56 | if workbook is not None: 57 | for sheet_name in workbook.sheetnames: 58 | sheet = workbook[sheet_name] 59 | html += f'<div><h1>{sheet_name}</h1>' + self._excel_to_html_table(sheet) + '</div>' 60 | else: 61 | raise ValueError("Invalid XLSX file") 62 | 63 | # We convert the HTML into a PDF 64 | HTML(string=html).write_pdf( 65 | self.temp_pdf_path, 66 | stylesheets=[CSS(string=css), self.get_font_css()] 67 | ) 68 | 69 | @staticmethod 70 | def _get_merged_cell_ranges(sheet): 71 | merged_info = {} 72 | for merged_range in sheet.merged_cells.ranges: 73 | min_col, min_row, max_col, max_row = merged_range.bounds 74 | merged_info[(min_row, min_col)] = { 75 | 'rowspan': max_row - min_row + 1, 76 | 'colspan': max_col - min_col + 1, 77 | 'range': merged_range 78 | } 79 | return merged_info 80 | 81 | def _excel_to_html_table(self, sheet): 82 | merged_cells = self._get_merged_cell_ranges(sheet) 83 | 84 | html = f'<table>' 85 | 86 | # Track cells we should skip due to being part of a merge range 87 | skip_cells = set() 88 | 89 | for row_idx, row in enumerate(sheet.rows, 1): 90 | html += '<tr>' 91 | for col_idx, cell in enumerate(row, 1): 92 | if (row_idx, col_idx) in skip_cells: 93 | continue 94 | 95 | # Check if this cell is the start of a merged range 96 | merge_info = merged_cells.get((row_idx, col_idx)) 97 | if merge_info: 98 | # Add cells to skip 99 | for r in range(row_idx, row_idx + merge_info['rowspan']): 100 | for c in range(col_idx, col_idx + merge_info['colspan']): 101 | if (r, c) != (row_idx, col_idx): 102 | skip_cells.add((r, c)) 103 | 104 | # Add merged cell with rowspan/colspan 105 | value = cell.value if cell.value is not None else '' 106 | html += f'<td rowspan="{merge_info["rowspan"]}" colspan="{merge_info["colspan"]}">{value}' 107 | else: 108 | # Regular cell 109 | value = cell.value if cell.value is not None else '' 110 | html += f'<td>{value}' 111 | 112 | html += '</td>' 113 | html += '</tr>' 114 | html += '</table>' 115 | return html 116 | ``` -------------------------------------------------------------------------------- /marker/processors/ignoretext.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | from collections import Counter 3 | from itertools import groupby 4 | from typing import Annotated, List 5 | 6 | from rapidfuzz import fuzz 7 | 8 | from marker.processors import BaseProcessor 9 | from marker.schema import BlockTypes 10 | from marker.schema.blocks import Block 11 | from marker.schema.document import Document 12 | 13 | 14 | class IgnoreTextProcessor(BaseProcessor): 15 | """ 16 | A processor for identifying and ignoring common text blocks in a document. 17 | These blocks often represent repetitive or non-essential elements, such as headers, footers, or page numbers. 
18 | """ 19 | block_types = ( 20 | BlockTypes.Text, BlockTypes.SectionHeader, 21 | BlockTypes.TextInlineMath 22 | ) 23 | common_element_threshold: Annotated[ 24 | float, 25 | "The minimum ratio of pages a text block must appear on to be considered a common element.", 26 | "Blocks that meet or exceed this threshold are marked as common elements.", 27 | ] = 0.2 28 | common_element_min_blocks: Annotated[ 29 | int, 30 | "The minimum number of occurrences of a text block within a document to consider it a common element.", 31 | "This ensures that rare blocks are not mistakenly flagged.", 32 | ] = 3 33 | max_streak: Annotated[ 34 | int, 35 | "The maximum number of consecutive occurrences of a text block allowed before it is classified as a common element.", 36 | "Helps to identify patterns like repeated headers or footers.", 37 | ] = 3 38 | text_match_threshold: Annotated[ 39 | int, 40 | "The minimum fuzzy match score (0-100) required to classify a text block as similar to a common element.", 41 | "Higher values enforce stricter matching.", 42 | ] = 90 43 | 44 | def __call__(self, document: Document): 45 | first_blocks = [] 46 | last_blocks = [] 47 | for page in document.pages: 48 | initial_block = None 49 | last_block = None 50 | for block in page.contained_blocks(document, self.block_types): 51 | if block.structure is not None: 52 | if initial_block is None: 53 | initial_block = block 54 | 55 | last_block = block 56 | 57 | if initial_block is not None: 58 | first_blocks.append(initial_block) 59 | if last_block is not None: 60 | last_blocks.append(last_block) 61 | 62 | self.filter_common_elements(document, first_blocks) 63 | self.filter_common_elements(document, last_blocks) 64 | 65 | @staticmethod 66 | def clean_text(text): 67 | text = text.replace("\n", "").strip() 68 | text = re.sub(r"^\d+\s*", "", text) # remove numbers at the start of the line 69 | text = re.sub(r"\s*\d+$", "", text) # remove numbers at the end of the line 70 | return text 71 | 72 | def filter_common_elements(self, document, blocks: List[Block]): 73 | # We can't filter if we don't have enough pages to find common elements 74 | if len(blocks) < self.common_element_min_blocks: 75 | return 76 | 77 | text = [self.clean_text(b.raw_text(document)) for b in blocks] 78 | 79 | streaks = {} 80 | for key, group in groupby(text): 81 | streaks[key] = max(streaks.get(key, 0), len(list(group))) 82 | 83 | counter = Counter(text) 84 | common = [ 85 | k for k, v in counter.items() 86 | if (v >= len(blocks) * self.common_element_threshold or streaks[k] >= self.max_streak) 87 | and v > self.common_element_min_blocks 88 | ] 89 | if len(common) == 0: 90 | return 91 | 92 | for t, b in zip(text, blocks): 93 | # Check against all common elements 94 | if any(fuzz.ratio(t, common_element) > self.text_match_threshold for common_element in common): 95 | b.ignore_for_output = True 96 | ``` -------------------------------------------------------------------------------- /marker/processors/llm/llm_complex.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | import markdown2 4 | from pydantic import BaseModel 5 | 6 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor 7 | 8 | from marker.schema import BlockTypes 9 | from marker.schema.document import Document 10 | 11 | 12 | class LLMComplexRegionProcessor(BaseLLMSimpleBlockProcessor): 13 | block_types = (BlockTypes.ComplexRegion,) 14 | complex_region_prompt = """You are a text correction expert specializing 
in accurately reproducing text from images. 15 | You will receive an image of a text block and the text that can be extracted from the image. 16 | Your task is to generate markdown to properly represent the content of the image. Do not omit any text present in the image - make sure everything is included in the markdown representation. The markdown representation should be as faithful to the original image as possible. 17 | 18 | Formatting should be in markdown, with the following rules: 19 | - * for italics, ** for bold, and ` for inline code. 20 | - Use <sup>...</sup> for superscripts. 21 | - Headers should be formatted with #, with one # for the largest header, and up to 6 for the smallest. 22 | - Lists should be formatted with either - or 1. for unordered and ordered lists, respectively. 23 | - Links should be formatted with [text](url). 24 | - Use ``` for code blocks. 25 | - Inline math should be formatted with <math>math expression</math>. 26 | - Display math should be formatted with <math display="block">math expression</math>. 27 | - Values and labels should be extracted from forms, and put into markdown tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables. 28 | - Tables should be formatted with markdown tables, with the headers bolded. 29 | 30 | **Instructions:** 31 | 1. Carefully examine the provided block image. 32 | 2. Analyze the existing text representation. 33 | 3. Generate the markdown representation of the content in the image. 34 | **Example:** 35 | Input: 36 | ```text 37 | Table 1: Car Sales 38 | ``` 39 | Output: 40 | ```markdown 41 | ## Table 1: Car Sales 42 | 43 | | Car | Sales | 44 | | --- | --- | 45 | | Honda | 100 | 46 | | Toyota | 200 | 47 | ``` 48 | **Input:** 49 | ```text 50 | {extracted_text} 51 | ``` 52 | """ 53 | 54 | def block_prompts(self, document: Document) -> List[PromptData]: 55 | prompt_data = [] 56 | for block in self.inference_blocks(document): 57 | text = block["block"].raw_text(document) 58 | prompt = self.complex_region_prompt.replace("{extracted_text}", text) 59 | image = self.extract_image(document, block["block"]) 60 | prompt_data.append({ 61 | "prompt": prompt, 62 | "image": image, 63 | "block": block["block"], 64 | "schema": ComplexSchema, 65 | "page": block["page"] 66 | }) 67 | return prompt_data 68 | 69 | def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document): 70 | block = prompt_data["block"] 71 | text = block.raw_text(document) 72 | 73 | if not response or "corrected_markdown" not in response: 74 | block.update_metadata(llm_error_count=1) 75 | return 76 | 77 | corrected_markdown = response["corrected_markdown"] 78 | 79 | # The original table is okay 80 | if "no corrections" in corrected_markdown.lower(): 81 | return 82 | 83 | # Potentially a partial response 84 | if len(corrected_markdown) < len(text) * .5: 85 | block.update_metadata(llm_error_count=1) 86 | return 87 | 88 | # Convert LLM markdown to html 89 | corrected_markdown = corrected_markdown.strip().lstrip("```markdown").rstrip("```").strip() 90 | block.html = markdown2.markdown(corrected_markdown, extras=["tables"]) 91 | 92 | class ComplexSchema(BaseModel): 93 | corrected_markdown: str ``` -------------------------------------------------------------------------------- /tests/converters/test_pdf_converter.py: -------------------------------------------------------------------------------- ```python 1 | import io 2 | 3 | import pytest 4 | from 
marker.converters.pdf import PdfConverter 5 | from marker.renderers.markdown import MarkdownOutput 6 | 7 | 8 | @pytest.mark.output_format("markdown") 9 | @pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True}) 10 | def test_pdf_converter(pdf_converter: PdfConverter, temp_doc): 11 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 12 | markdown = markdown_output.markdown 13 | 14 | # Basic assertions 15 | assert len(markdown) > 0 16 | assert "# Subspace Adversarial Training" in markdown 17 | 18 | # Some assertions for line joining across pages 19 | assert ( 20 | "AT solutions. However, these methods highly rely on specifically" in markdown 21 | ) # pgs: 1-2 22 | assert ( 23 | "(with adversarial perturbations), which harms natural accuracy, " in markdown 24 | ) # pgs: 3-4 25 | 26 | # Some assertions for line joining across columns 27 | assert "remain similar across a wide range of choices." in markdown # pg: 2 28 | assert "a new scheme for designing more robust and efficient" in markdown # pg: 8 29 | 30 | 31 | @pytest.mark.filename("manual.epub") 32 | @pytest.mark.config({"page_range": [0]}) 33 | def test_epub_converter(pdf_converter: PdfConverter, temp_doc): 34 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 35 | markdown = markdown_output.markdown 36 | 37 | # Basic assertions 38 | assert "Simple Sabotage Field Manual" in markdown 39 | 40 | 41 | @pytest.mark.filename("single_sheet.xlsx") 42 | @pytest.mark.config({"page_range": [0]}) 43 | def test_xlsx_converter(pdf_converter: PdfConverter, temp_doc): 44 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 45 | markdown = markdown_output.markdown 46 | 47 | # Basic assertions 48 | assert "four" in markdown 49 | 50 | 51 | @pytest.mark.filename("china.html") 52 | @pytest.mark.config({"page_range": [10]}) 53 | def test_html_converter(pdf_converter: PdfConverter, temp_doc): 54 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 55 | markdown = markdown_output.markdown 56 | 57 | # Basic assertions 58 | assert "Republic of China" in markdown 59 | 60 | 61 | @pytest.mark.filename("gatsby.docx") 62 | @pytest.mark.config({"page_range": [0]}) 63 | def test_docx_converter(pdf_converter: PdfConverter, temp_doc): 64 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 65 | markdown = markdown_output.markdown 66 | 67 | # Basic assertions 68 | assert "The Decline of the American Dream in the 1920s" in markdown 69 | 70 | 71 | @pytest.mark.filename("lambda.pptx") 72 | @pytest.mark.config({"page_range": [0]}) 73 | def test_pptx_converter(pdf_converter: PdfConverter, temp_doc): 74 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 75 | markdown = markdown_output.markdown 76 | 77 | # Basic assertions 78 | assert "Adam Doupé" in markdown 79 | 80 | 81 | @pytest.mark.output_format("markdown") 82 | @pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True}) 83 | def test_pdf_converter_bytes(pdf_converter: PdfConverter, temp_doc): 84 | with open(temp_doc.name, "rb") as f: 85 | data = f.read() 86 | 87 | input_bytes = io.BytesIO(data) 88 | markdown_output: MarkdownOutput = pdf_converter(input_bytes) 89 | markdown = markdown_output.markdown 90 | 91 | # Basic assertions 92 | assert len(markdown) > 0 93 | assert "# Subspace Adversarial Training" in markdown 94 | 95 | # Some assertions for line joining across pages 96 | assert ( 97 | "AT solutions. 
However, these methods highly rely on specifically" in markdown 98 | ) # pgs: 1-2 99 | assert ( 100 | "(with adversarial perturbations), which harms natural accuracy, " in markdown 101 | ) # pgs: 3-4 102 | 103 | # Some assertions for line joining across columns 104 | assert "remain similar across a wide range of choices." in markdown # pg: 2 105 | assert "a new scheme for designing more robust and efficient" in markdown # pg: 8 106 | ``` -------------------------------------------------------------------------------- /benchmarks/table/table.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS 3 | 4 | from pathlib import Path 5 | from itertools import repeat 6 | from typing import List 7 | 8 | import time 9 | import datasets 10 | from tqdm import tqdm 11 | import click 12 | from tabulate import tabulate 13 | import json 14 | from concurrent.futures import ProcessPoolExecutor 15 | 16 | from marker.settings import settings 17 | from benchmarks.table.inference import inference_tables 18 | 19 | from scoring import wrap_table_html, similarity_eval_html 20 | 21 | def update_teds_score(result, prefix: str = "marker"): 22 | prediction, ground_truth = result[f'{prefix}_table'], result['gt_table'] 23 | prediction, ground_truth = wrap_table_html(prediction), wrap_table_html(ground_truth) 24 | score = similarity_eval_html(prediction, ground_truth) 25 | result.update({f'{prefix}_score':score}) 26 | return result 27 | 28 | 29 | @click.command(help="Benchmark Table to HTML Conversion") 30 | @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "table"), help="Output path for results.") 31 | @click.option("--dataset", type=str, default="datalab-to/fintabnet_bench_marker", help="Dataset to use") 32 | @click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process") 33 | @click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use") 34 | @click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.") 35 | @click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.") 36 | @click.option("--use_gemini", is_flag=True, help="Evaluate Gemini for table recognition.") 37 | def main( 38 | result_path: str, 39 | dataset: str, 40 | max_rows: int, 41 | max_workers: int, 42 | use_llm: bool, 43 | table_rec_batch_size: int | None, 44 | use_gemini: bool = False 45 | ): 46 | start = time.time() 47 | 48 | 49 | dataset = datasets.load_dataset(dataset, split='train') 50 | dataset = dataset.shuffle(seed=0) 51 | 52 | results, total_unaligned = inference_tables(dataset, use_llm, table_rec_batch_size, max_rows, use_gemini) 53 | 54 | print(f"Total time: {time.time() - start}.") 55 | print(f"Could not align {total_unaligned} tables from fintabnet.") 56 | 57 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 58 | marker_results = list( 59 | tqdm( 60 | executor.map(update_teds_score, results), desc='Computing alignment scores', total=len(results) 61 | ) 62 | ) 63 | 64 | avg_score = sum([r["marker_score"] for r in marker_results]) / len(marker_results) 65 | headers = ["Avg score", "Total tables"] 66 | data = [f"{avg_score:.3f}", len(marker_results)] 67 | gemini_results = None 68 | if use_gemini: 69 | with ProcessPoolExecutor(max_workers=max_workers) as executor: 70 | gemini_results = 
list(
71 |                 tqdm(
72 |                     executor.map(update_teds_score, results, repeat("gemini")), desc='Computing Gemini scores',
73 |                     total=len(results)
74 |                 )
75 |             )
76 |         avg_gemini_score = sum([r["gemini_score"] for r in gemini_results]) / len(gemini_results)
77 |         headers.append("Avg Gemini score")
78 |         data.append(f"{avg_gemini_score:.3f}")
79 | 
80 |     table = tabulate([data], headers=headers, tablefmt="github")
81 |     print(table)
82 |     print("Avg score computed by comparing marker predicted HTML with original HTML")
83 | 
84 |     results = {
85 |         "marker": marker_results,
86 |         "gemini": gemini_results
87 |     }
88 | 
89 |     out_path = Path(result_path)
90 |     out_path.mkdir(parents=True, exist_ok=True)
91 |     with open(out_path / "table.json", "w+") as f:
92 |         json.dump(results, f, indent=2)
93 | 
94 |     print(f"Results saved to {out_path}.")
95 | 
96 | if __name__ == '__main__':
97 |     main()
```
--------------------------------------------------------------------------------
/benchmarks/table/scoring.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
3 | """
4 | 
5 | import distance
6 | from apted import APTED, Config
7 | from apted.helpers import Tree
8 | from lxml import html
9 | from collections import deque
10 | 
11 | def wrap_table_html(table_html:str)->str:
12 |     return f'<html><body>{table_html}</body></html>'
13 | 
14 | class TableTree(Tree):
15 |     def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
16 |         self.tag = tag
17 |         self.colspan = colspan
18 |         self.rowspan = rowspan
19 |         self.content = content
20 | 
21 |         # Sets self.name and self.children
22 |         super().__init__(tag, *children)
23 | 
24 |     def bracket(self):
25 |         """Show tree using brackets notation"""
26 |         if self.tag == 'td':
27 |             result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
28 |                      (self.tag, self.colspan, self.rowspan, self.content)
29 |         else:
30 |             result = '"tag": %s' % self.tag
31 |         for child in self.children:
32 |             result += child.bracket()
33 |         return "{{{}}}".format(result)
34 | 
35 | class CustomConfig(Config):
36 |     @staticmethod
37 |     def maximum(*sequences):
38 |         return max(map(len, sequences))
39 | 
40 |     def normalized_distance(self, *sequences):
41 |         return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
42 | 
43 |     def rename(self, node1, node2):
44 |         if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
45 |             return 1.
46 |         if node1.tag == 'td':
47 |             if node1.content or node2.content:
48 |                 return self.normalized_distance(node1.content, node2.content)
49 |         return 0.
50 | 51 | def tokenize(node): 52 | """ 53 | Tokenizes table cells 54 | """ 55 | global __tokens__ 56 | __tokens__.append('<%s>' % node.tag) 57 | if node.text is not None: 58 | __tokens__ += list(node.text) 59 | for n in node.getchildren(): 60 | tokenize(n) 61 | if node.tag != 'unk': 62 | __tokens__.append('</%s>' % node.tag) 63 | if node.tag != 'td' and node.tail is not None: 64 | __tokens__ += list(node.tail) 65 | 66 | def tree_convert_html(node, convert_cell=False, parent=None): 67 | """ 68 | Converts HTML tree to the format required by apted 69 | """ 70 | global __tokens__ 71 | if node.tag == 'td': 72 | if convert_cell: 73 | __tokens__ = [] 74 | tokenize(node) 75 | cell = __tokens__[1:-1].copy() 76 | else: 77 | cell = [] 78 | new_node = TableTree(node.tag, 79 | int(node.attrib.get('colspan', '1')), 80 | int(node.attrib.get('rowspan', '1')), 81 | cell, *deque()) 82 | else: 83 | new_node = TableTree(node.tag, None, None, None, *deque()) 84 | if parent is not None: 85 | parent.children.append(new_node) 86 | if node.tag != 'td': 87 | for n in node.getchildren(): 88 | tree_convert_html(n, convert_cell, new_node) 89 | if parent is None: 90 | return new_node 91 | 92 | def similarity_eval_html(pred, true, structure_only=False): 93 | """ 94 | Computes TEDS score between the prediction and the ground truth of a given samples 95 | """ 96 | pred, true = html.fromstring(pred), html.fromstring(true) 97 | if pred.xpath('body/table') and true.xpath('body/table'): 98 | pred = pred.xpath('body/table')[0] 99 | true = true.xpath('body/table')[0] 100 | n_nodes_pred = len(pred.xpath(".//*")) 101 | n_nodes_true = len(true.xpath(".//*")) 102 | tree_pred = tree_convert_html(pred, convert_cell=not structure_only) 103 | tree_true = tree_convert_html(true, convert_cell=not structure_only) 104 | n_nodes = max(n_nodes_pred, n_nodes_true) 105 | distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance() 106 | return 1.0 - (float(distance) / n_nodes) 107 | else: 108 | return 0.0 109 | 110 | ``` -------------------------------------------------------------------------------- /marker/schema/text/span.py: -------------------------------------------------------------------------------- ```python 1 | import html 2 | import re 3 | from typing import List, Literal, Optional 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.blocks import Block 7 | from marker.util import unwrap_math 8 | 9 | 10 | def cleanup_text(full_text): 11 | full_text = re.sub(r"(\n\s){3,}", "\n\n", full_text) 12 | full_text = full_text.replace("\xa0", " ") # Replace non-breaking spaces 13 | return full_text 14 | 15 | 16 | class Span(Block): 17 | block_type: BlockTypes = BlockTypes.Span 18 | block_description: str = "A span of text inside a line." 
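# Raw text plus font metadata for the span; `formats` records every style
# detected on it, and the properties below expose each format as a boolean
# flag for the HTML assembly step.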
19 | 20 | text: str 21 | font: str 22 | font_weight: float 23 | font_size: float 24 | minimum_position: int 25 | maximum_position: int 26 | formats: List[ 27 | Literal[ 28 | "plain", 29 | "math", 30 | "chemical", 31 | "bold", 32 | "italic", 33 | "highlight", 34 | "subscript", 35 | "superscript", 36 | "small", 37 | "code", 38 | "underline", 39 | ] 40 | ] 41 | has_superscript: bool = False 42 | has_subscript: bool = False 43 | url: Optional[str] = None 44 | html: Optional[str] = None 45 | 46 | @property 47 | def bold(self): 48 | return "bold" in self.formats 49 | 50 | @property 51 | def italic(self): 52 | return "italic" in self.formats 53 | 54 | @property 55 | def math(self): 56 | return "math" in self.formats 57 | 58 | @property 59 | def highlight(self): 60 | return "highlight" in self.formats 61 | 62 | @property 63 | def superscript(self): 64 | return "superscript" in self.formats 65 | 66 | @property 67 | def subscript(self): 68 | return "subscript" in self.formats 69 | 70 | @property 71 | def small(self): 72 | return "small" in self.formats 73 | 74 | @property 75 | def code(self): 76 | return "code" in self.formats 77 | 78 | @property 79 | def underline(self): 80 | return "underline" in self.formats 81 | 82 | def assemble_html(self, document, child_blocks, parent_structure, block_config): 83 | if self.ignore_for_output: 84 | return "" 85 | 86 | if self.html: 87 | return self.html 88 | 89 | text = self.text 90 | 91 | # Remove trailing newlines 92 | replaced_newline = False 93 | while len(text) > 0 and text[-1] in ["\n", "\r"]: 94 | text = text[:-1] 95 | replaced_newline = True 96 | 97 | # Remove leading newlines 98 | while len(text) > 0 and text[0] in ["\n", "\r"]: 99 | text = text[1:] 100 | 101 | if replaced_newline and not text.endswith("-"): 102 | text += " " 103 | 104 | text = text.replace( 105 | "-\n", "" 106 | ) # Remove hyphenated line breaks from the middle of the span 107 | text = html.escape(text) 108 | text = cleanup_text(text) 109 | 110 | if self.has_superscript: 111 | text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", text) 112 | 113 | # Handle full block superscript 114 | if "<sup>" not in text: 115 | text = f"<sup>{text}</sup>" 116 | 117 | if self.url: 118 | text = f"<a href='{self.url}'>{text}</a>" 119 | 120 | # TODO Support multiple formats 121 | if self.italic: 122 | text = f"<i>{text}</i>" 123 | elif self.bold: 124 | text = f"<b>{text}</b>" 125 | elif self.math: 126 | block_envs = ["split", "align", "gather", "multline"] 127 | if any(f"\\begin{{{env}}}" in text for env in block_envs): 128 | display_mode = "block" 129 | else: 130 | display_mode = "inline" 131 | text = f"<math display='{display_mode}'>{text}</math>" 132 | elif self.highlight: 133 | text = f"<mark>{text}</mark>" 134 | elif self.subscript: 135 | text = f"<sub>{text}</sub>" 136 | elif self.superscript: 137 | text = f"<sup>{text}</sup>" 138 | elif self.underline: 139 | text = f"<u>{text}</u>" 140 | elif self.small: 141 | text = f"<small>{text}</small>" 142 | elif self.code: 143 | text = f"<code>{text}</code>" 144 | 145 | text = unwrap_math(text) 146 | return text 147 | ``` -------------------------------------------------------------------------------- /marker/processors/llm/llm_handwriting.py: -------------------------------------------------------------------------------- ```python 1 | import markdown2 2 | from pydantic import BaseModel 3 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.document 
import Document 7 | 8 | from typing import Annotated, List 9 | 10 | 11 | class LLMHandwritingProcessor(BaseLLMSimpleBlockProcessor): 12 | block_types = (BlockTypes.Handwriting, BlockTypes.Text) 13 | handwriting_generation_prompt: Annotated[ 14 | str, 15 | "The prompt to use for OCRing handwriting.", 16 | "Default is a string containing the Gemini prompt." 17 | ] = """You are an expert editor specializing in accurately reproducing text from images. 18 | You will receive an image of a text block. Your task is to generate markdown to properly represent the content of the image. Do not omit any text present in the image - make sure everything is included in the markdown representation. The markdown representation should be as faithful to the original image as possible. 19 | 20 | Formatting should be in markdown, with the following rules: 21 | - * for italics, ** for bold, and ` for inline code. 22 | - Headers should be formatted with #, with one # for the largest header, and up to 6 for the smallest. 23 | - Lists should be formatted with either - or 1. for unordered and ordered lists, respectively. 24 | - Links should be formatted with [text](url). 25 | - Use ``` for code blocks. 26 | - Inline math should be formatted with <math>math expression</math>. 27 | - Display math should be formatted with <math display="block">math expression</math>. 28 | - Values and labels should be extracted from forms, and put into markdown tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables. 29 | - Tables should be formatted with markdown tables, with the headers bolded. 30 | 31 | **Instructions:** 32 | 1. Carefully examine the provided block image. 33 | 2. Output the markdown representing the content of the image. 
34 | """ 35 | 36 | def inference_blocks(self, document: Document) -> List[BlockData]: 37 | blocks = super().inference_blocks(document) 38 | out_blocks = [] 39 | for block_data in blocks: 40 | raw_text = block_data["block"].raw_text(document) 41 | block = block_data["block"] 42 | 43 | # Don't process text blocks that contain lines already 44 | if block.block_type == BlockTypes.Text: 45 | lines = block.contained_blocks(document, (BlockTypes.Line,)) 46 | if len(lines) > 0 or len(raw_text.strip()) > 0: 47 | continue 48 | out_blocks.append(block_data) 49 | return out_blocks 50 | 51 | 52 | def block_prompts(self, document: Document) -> List[PromptData]: 53 | prompt_data = [] 54 | for block_data in self.inference_blocks(document): 55 | block = block_data["block"] 56 | prompt = self.handwriting_generation_prompt 57 | image = self.extract_image(document, block) 58 | 59 | prompt_data.append({ 60 | "prompt": prompt, 61 | "image": image, 62 | "block": block, 63 | "schema": HandwritingSchema, 64 | "page": block_data["page"] 65 | }) 66 | return prompt_data 67 | 68 | def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document): 69 | block = prompt_data["block"] 70 | raw_text = block.raw_text(document) 71 | 72 | if not response or "markdown" not in response: 73 | block.update_metadata(llm_error_count=1) 74 | return 75 | 76 | markdown = response["markdown"] 77 | if len(markdown) < len(raw_text) * .5: 78 | block.update_metadata(llm_error_count=1) 79 | return 80 | 81 | markdown = markdown.strip().lstrip("```markdown").rstrip("```").strip() 82 | block.html = markdown2.markdown(markdown, extras=["tables"]) 83 | 84 | class HandwritingSchema(BaseModel): 85 | markdown: str 86 | ``` -------------------------------------------------------------------------------- /marker/services/azure_openai.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import time 3 | from typing import Annotated, List 4 | 5 | import PIL 6 | from marker.logger import get_logger 7 | from openai import AzureOpenAI, APITimeoutError, RateLimitError 8 | from PIL import Image 9 | from pydantic import BaseModel 10 | 11 | from marker.schema.blocks import Block 12 | from marker.services import BaseService 13 | 14 | logger = get_logger() 15 | 16 | 17 | class AzureOpenAIService(BaseService): 18 | azure_endpoint: Annotated[ 19 | str, "The Azure OpenAI endpoint URL. No trailing slash." 20 | ] = None 21 | azure_api_key: Annotated[ 22 | str, "The API key to use for the Azure OpenAI service." 23 | ] = None 24 | azure_api_version: Annotated[str, "The Azure OpenAI API version to use."] = None 25 | deployment_name: Annotated[ 26 | str, "The deployment name for the Azure OpenAI model." 
27 | ] = None 28 | 29 | def process_images(self, images: List[PIL.Image.Image]) -> list: 30 | if isinstance(images, Image.Image): 31 | images = [images] 32 | 33 | return [ 34 | { 35 | "type": "image_url", 36 | "image_url": { 37 | "url": "data:image/webp;base64,{}".format(self.img_to_base64(img)), 38 | }, 39 | } 40 | for img in images 41 | ] 42 | 43 | def __call__( 44 | self, 45 | prompt: str, 46 | image: PIL.Image.Image | List[PIL.Image.Image] | None, 47 | block: Block | None, 48 | response_schema: type[BaseModel], 49 | max_retries: int | None = None, 50 | timeout: int | None = None, 51 | ): 52 | if max_retries is None: 53 | max_retries = self.max_retries 54 | 55 | if timeout is None: 56 | timeout = self.timeout 57 | 58 | client = self.get_client() 59 | image_data = self.format_image_for_llm(image) 60 | 61 | messages = [ 62 | { 63 | "role": "user", 64 | "content": [ 65 | *image_data, 66 | {"type": "text", "text": prompt}, 67 | ], 68 | } 69 | ] 70 | 71 | total_tries = max_retries + 1 72 | for tries in range(1, total_tries + 1): 73 | try: 74 | response = client.beta.chat.completions.parse( 75 | extra_headers={ 76 | "X-Title": "Marker", 77 | "HTTP-Referer": "https://github.com/datalab-to/marker", 78 | }, 79 | model=self.deployment_name, 80 | messages=messages, 81 | timeout=timeout, 82 | response_format=response_schema, 83 | ) 84 | response_text = response.choices[0].message.content 85 | total_tokens = response.usage.total_tokens 86 | if block: 87 | block.update_metadata( 88 | llm_tokens_used=total_tokens, llm_request_count=1 89 | ) 90 | return json.loads(response_text) 91 | except (APITimeoutError, RateLimitError) as e: 92 | # Rate limit exceeded 93 | if tries == total_tries: 94 | # Last attempt failed. Give up 95 | logger.error( 96 | f"Rate limit error: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})" 97 | ) 98 | break 99 | else: 100 | wait_time = tries * self.retry_wait_time 101 | logger.warning( 102 | f"Rate limit error: {e}. Retrying in {wait_time} seconds... 
(Attempt {tries}/{total_tries})" 103 | ) 104 | time.sleep(wait_time) 105 | except Exception as e: 106 | logger.error(f"Azure OpenAI inference failed: {e}") 107 | break 108 | 109 | return {} 110 | 111 | def get_client(self) -> AzureOpenAI: 112 | return AzureOpenAI( 113 | api_version=self.azure_api_version, 114 | azure_endpoint=self.azure_endpoint, 115 | api_key=self.azure_api_key, 116 | ) 117 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/scorers/clean.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | import subprocess 3 | import tempfile 4 | from pathlib import Path 5 | 6 | import latex2mathml.converter 7 | 8 | class MarkdownCleaner: 9 | def __init__(self): 10 | pass 11 | 12 | def __call__(self, markdown): 13 | markdown = self.normalize_markdown(markdown) # Use pandoc to normalize 14 | 15 | # Replace math expressions with latexml 16 | pattern = r'(?<!\\)\$(?:\$([^$]+)\$\$|\s*([^$\n]+?)\s*\$)' 17 | markdown = re.sub(pattern, self.standardize_math, markdown) 18 | 19 | # Replace image urls with a generic tag 20 | pattern = r'!\[(.*?)\]\((https?://[^\s\)]+)\)' 21 | markdown = re.sub(pattern, r'![link]', markdown) 22 | 23 | # Clean up stray html tags 24 | markdown = markdown.replace("<br>", "\n") 25 | markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown) 26 | markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown) 27 | markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown) # Remove span tags and keep content 28 | 29 | # Clean up markdown formatting 30 | markdown = re.sub(r"\s+", " ", markdown) 31 | markdown = re.sub(r"\n+", "\n", markdown) 32 | markdown = re.sub("\\.+", ".", 33 | markdown) # Replace repeated periods with a single period, like in table of contents 34 | markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header 35 | markdown = markdown.encode().decode('unicode-escape', errors="ignore") # Decode unicode characters properly 36 | return markdown.strip().lower() 37 | 38 | @staticmethod 39 | def normalize_markdown(md_text: str) -> str: 40 | with tempfile.TemporaryDirectory() as tmp_dir: 41 | dirpath = Path(tmp_dir) 42 | input_file = dirpath / 'input.md' 43 | input_file.write_text(md_text, encoding='utf-8') 44 | 45 | # Markdown to HTML 46 | html_file = dirpath / 'temp.html' 47 | subprocess.run( 48 | [ 49 | 'pandoc', 50 | str(input_file), 51 | '-f', 'markdown+tex_math_dollars', 52 | '-t', 'html', 53 | '-o', str(html_file), 54 | '--quiet' 55 | ], 56 | check=True 57 | ) 58 | 59 | # HTML to Markdown 60 | output_file = dirpath / 'output.md' 61 | subprocess.run( 62 | [ 63 | 'pandoc', 64 | str(html_file), 65 | '-f', 'html', 66 | '-t', 'markdown+tex_math_dollars', 67 | '-o', str(output_file), 68 | '--quiet' 69 | ], 70 | check=True 71 | ) 72 | 73 | # Read back the normalized Markdown 74 | normalized_md = output_file.read_text(encoding='utf-8') 75 | 76 | return normalized_md 77 | 78 | def standardize_math(self, match): 79 | try: 80 | delim = "$$" if match.group(0).startswith('$$') else "$" 81 | math_content = match.group(1) or match.group(2) 82 | if delim == "$$": 83 | math_content = latex2mathml.converter.convert(math_content) 84 | else: 85 | math_content = self.clean_latex(math_content) 86 | return f'{delim}{math_content}{delim}' 87 | except Exception as e: 88 | print(f"Failed to standardize math expression: {match.group(0)} with error: {e}") 89 | return match.group(0) 90 | 91 | @staticmethod 92 | def clean_latex(latex_str): 93 | 
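# Normalize LaTeX before scoring: collapse whitespace, unwrap common text
# and style macros, and map equivalent commands to a single spelling so that
# purely notational differences do not count against a method.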
latex_str = re.sub(r'\s+', ' ', latex_str.strip()) 94 | for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']: 95 | latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str) 96 | 97 | replacements = { 98 | '\\times': '*', 99 | '\\cdot': '*', 100 | '\\div': '/', 101 | '\\le': '<=', 102 | '\\ge': '>=', 103 | '\\neq': '!=', 104 | '\\to': '\\rightarrow', 105 | } 106 | 107 | for old, new in replacements.items(): 108 | latex_str = latex_str.replace(old, new) 109 | 110 | return latex_str 111 | 112 | 113 | 114 | ``` -------------------------------------------------------------------------------- /marker/utils/gpu.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | import subprocess 3 | import torch 4 | 5 | from marker.logger import get_logger 6 | from marker.settings import settings 7 | 8 | logger = get_logger() 9 | 10 | 11 | class GPUManager: 12 | default_gpu_vram: int = 8 13 | 14 | def __init__(self, device_idx: int): 15 | self.device_idx = device_idx 16 | self.original_compute_mode = None 17 | self.mps_server_process = None 18 | 19 | def __enter__(self): 20 | if self.using_cuda(): 21 | self.start_mps_server() 22 | return self 23 | 24 | def __exit__(self, exc_type, exc_val, exc_tb): 25 | if self.using_cuda(): 26 | self.cleanup() 27 | 28 | @staticmethod 29 | def using_cuda(): 30 | return "cuda" in settings.TORCH_DEVICE_MODEL 31 | 32 | def check_cuda_available(self) -> bool: 33 | if not torch.cuda.is_available(): 34 | return False 35 | try: 36 | subprocess.run(["nvidia-smi", "--version"], capture_output=True, check=True) 37 | return True 38 | except (subprocess.CalledProcessError, FileNotFoundError): 39 | return False 40 | 41 | def get_gpu_vram(self): 42 | if not self.using_cuda(): 43 | return self.default_gpu_vram 44 | 45 | try: 46 | result = subprocess.run( 47 | [ 48 | "nvidia-smi", 49 | "--query-gpu=memory.total", 50 | "--format=csv,noheader,nounits", 51 | "-i", 52 | str(self.device_idx), 53 | ], 54 | capture_output=True, 55 | text=True, 56 | check=True, 57 | ) 58 | 59 | vram_mb = int(result.stdout.strip()) 60 | vram_gb = int(vram_mb / 1024) 61 | return vram_gb 62 | 63 | except (subprocess.CalledProcessError, ValueError, FileNotFoundError): 64 | return self.default_gpu_vram 65 | 66 | def start_mps_server(self) -> bool: 67 | if not self.check_cuda_available(): 68 | return False 69 | 70 | try: 71 | # Set MPS environment with chunk-specific directories 72 | env = os.environ.copy() 73 | pipe_dir = f"/tmp/nvidia-mps-{self.device_idx}" 74 | log_dir = f"/tmp/nvidia-log-{self.device_idx}" 75 | env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir 76 | env["CUDA_MPS_LOG_DIRECTORY"] = log_dir 77 | 78 | # Create directories 79 | os.makedirs(pipe_dir, exist_ok=True) 80 | os.makedirs(log_dir, exist_ok=True) 81 | 82 | # Start MPS control daemon 83 | self.mps_server_process = subprocess.Popen( 84 | ["nvidia-cuda-mps-control", "-d"], 85 | env=env, 86 | stdout=subprocess.PIPE, 87 | stderr=subprocess.PIPE, 88 | ) 89 | 90 | logger.info(f"Started NVIDIA MPS server for chunk {self.device_idx}") 91 | return True 92 | except (subprocess.CalledProcessError, FileNotFoundError) as e: 93 | logger.warning( 94 | f"Failed to start MPS server for chunk {self.device_idx}: {e}" 95 | ) 96 | return False 97 | 98 | def stop_mps_server(self) -> None: 99 | try: 100 | # Stop MPS server 101 | env = os.environ.copy() 102 | env["CUDA_MPS_PIPE_DIRECTORY"] = f"/tmp/nvidia-mps-{self.device_idx}" 103 | env["CUDA_MPS_LOG_DIRECTORY"] = f"/tmp/nvidia-log-{self.device_idx}" 104 | 105 | 
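# Send "quit" on stdin to the MPS control daemon; this is the documented
# nvidia-cuda-mps-control command for shutting the server down.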
subprocess.run( 106 | ["nvidia-cuda-mps-control"], 107 | input="quit\n", 108 | text=True, 109 | env=env, 110 | timeout=10, 111 | ) 112 | 113 | if self.mps_server_process: 114 | self.mps_server_process.terminate() 115 | try: 116 | self.mps_server_process.wait(timeout=5) 117 | except subprocess.TimeoutExpired: 118 | self.mps_server_process.kill() 119 | self.mps_server_process = None 120 | 121 | logger.info(f"Stopped NVIDIA MPS server for chunk {self.device_idx}") 122 | except Exception as e: 123 | logger.warning( 124 | f"Failed to stop MPS server for chunk {self.device_idx}: {e}" 125 | ) 126 | 127 | def cleanup(self) -> None: 128 | self.stop_mps_server() 129 | ``` -------------------------------------------------------------------------------- /marker/processors/sectionheader.py: -------------------------------------------------------------------------------- ```python 1 | import warnings 2 | from typing import Annotated, Dict, List 3 | 4 | import numpy as np 5 | from sklearn.cluster import KMeans 6 | from sklearn.exceptions import ConvergenceWarning 7 | 8 | from marker.processors import BaseProcessor 9 | from marker.schema import BlockTypes 10 | from marker.schema.document import Document 11 | 12 | # Ignore sklearn warning about not converging 13 | warnings.filterwarnings("ignore", category=ConvergenceWarning) 14 | 15 | 16 | class SectionHeaderProcessor(BaseProcessor): 17 | """ 18 | A processor for recognizing section headers in the document. 19 | """ 20 | block_types = (BlockTypes.SectionHeader, ) 21 | level_count: Annotated[ 22 | int, 23 | "The number of levels to use for headings.", 24 | ] = 4 25 | merge_threshold: Annotated[ 26 | float, 27 | "The minimum gap between headings to consider them part of the same group.", 28 | ] = 0.25 29 | default_level: Annotated[ 30 | int, 31 | "The default heading level to use if no heading level is detected.", 32 | ] = 2 33 | height_tolerance: Annotated[ 34 | float, 35 | "The minimum height of a heading to consider it a heading.", 36 | ] = 0.99 37 | 38 | def __call__(self, document: Document): 39 | line_heights: Dict[int, float] = {} 40 | for page in document.pages: 41 | # Iterate children to grab all section headers 42 | for block in page.children: 43 | if block.block_type not in self.block_types: 44 | continue 45 | if block.structure is not None: 46 | line_heights[block.id] = block.line_height(document) 47 | else: 48 | line_heights[block.id] = 0 49 | block.ignore_for_output = True # Don't output an empty section header 50 | 51 | flat_line_heights = list(line_heights.values()) 52 | heading_ranges = self.bucket_headings(flat_line_heights) 53 | 54 | for page in document.pages: 55 | # Iterate children to grab all section headers 56 | for block in page.children: 57 | if block.block_type not in self.block_types: 58 | continue 59 | block_height = line_heights.get(block.id, 0) 60 | if block_height > 0: 61 | for idx, (min_height, max_height) in enumerate(heading_ranges): 62 | if block_height >= min_height * self.height_tolerance: 63 | block.heading_level = idx + 1 64 | break 65 | 66 | if block.heading_level is None: 67 | block.heading_level = self.default_level 68 | 69 | def bucket_headings(self, line_heights: List[float], num_levels=4): 70 | if len(line_heights) <= self.level_count: 71 | return [] 72 | 73 | data = np.asarray(line_heights).reshape(-1, 1) 74 | labels = KMeans(n_clusters=num_levels, random_state=0, n_init="auto").fit_predict(data) 75 | data_labels = np.concatenate([data, labels.reshape(-1, 1)], axis=1) 76 | data_labels = np.sort(data_labels, 
axis=0) 77 | 78 | cluster_means = {int(label): float(np.mean(data_labels[data_labels[:, 1] == label, 0])) for label in np.unique(labels)} 79 | label_max = None 80 | label_min = None 81 | heading_ranges = [] 82 | prev_cluster = None 83 | for row in data_labels: 84 | value, label = row 85 | value = float(value) 86 | label = int(label) 87 | if prev_cluster is not None and label != prev_cluster: 88 | prev_cluster_mean = cluster_means[prev_cluster] 89 | cluster_mean = cluster_means[label] 90 | if cluster_mean * self.merge_threshold < prev_cluster_mean: 91 | heading_ranges.append((label_min, label_max)) 92 | label_min = None 93 | label_max = None 94 | 95 | label_min = value if label_min is None else min(label_min, value) 96 | label_max = value if label_max is None else max(label_max, value) 97 | prev_cluster = label 98 | 99 | if label_min is not None: 100 | heading_ranges.append((label_min, label_max)) 101 | 102 | heading_ranges = sorted(heading_ranges, reverse=True) 103 | 104 | return heading_ranges 105 | ``` -------------------------------------------------------------------------------- /marker/config/printer.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Optional 2 | 3 | import click 4 | 5 | from marker.config.crawler import crawler 6 | 7 | 8 | class CustomClickPrinter(click.Command): 9 | def parse_args(self, ctx, args): 10 | display_help = "config" in args and "--help" in args 11 | if display_help: 12 | click.echo( 13 | "Here is a list of all the Builders, Processors, Converters, Providers and Renderers in Marker along with their attributes:" 14 | ) 15 | 16 | # Keep track of shared attributes and their types 17 | shared_attrs = {} 18 | 19 | # First pass: identify shared attributes and verify compatibility 20 | for base_type, base_type_dict in crawler.class_config_map.items(): 21 | for class_name, class_map in base_type_dict.items(): 22 | for attr, (attr_type, formatted_type, default, metadata) in class_map[ 23 | "config" 24 | ].items(): 25 | if attr not in shared_attrs: 26 | shared_attrs[attr] = { 27 | "classes": [], 28 | "type": attr_type, 29 | "is_flag": attr_type in [bool, Optional[bool]] 30 | and not default, 31 | "metadata": metadata, 32 | "default": default, 33 | } 34 | shared_attrs[attr]["classes"].append(class_name) 35 | 36 | # These are the types of attrs that can be set from the command line 37 | attr_types = [ 38 | str, 39 | int, 40 | float, 41 | bool, 42 | Optional[int], 43 | Optional[float], 44 | Optional[str], 45 | ] 46 | 47 | # Add shared attribute options first 48 | for attr, info in shared_attrs.items(): 49 | if info["type"] in attr_types: 50 | ctx.command.params.append( 51 | click.Option( 52 | ["--" + attr], 53 | type=info["type"], 54 | help=" ".join(info["metadata"]) 55 | + f" (Applies to: {', '.join(info['classes'])})", 56 | default=None, # This is important, or it sets all the default keys again in config 57 | is_flag=info["is_flag"], 58 | flag_value=True if info["is_flag"] else None, 59 | ) 60 | ) 61 | 62 | # Second pass: create class-specific options 63 | for base_type, base_type_dict in crawler.class_config_map.items(): 64 | if display_help: 65 | click.echo(f"{base_type}s:") 66 | for class_name, class_map in base_type_dict.items(): 67 | if display_help and class_map["config"]: 68 | click.echo( 69 | f"\n {class_name}: {class_map['class_type'].__doc__ or ''}" 70 | ) 71 | click.echo(" " * 4 + "Attributes:") 72 | for attr, (attr_type, formatted_type, default, metadata) in class_map[ 73 | "config" 74 | 
].items(): 75 | class_name_attr = class_name + "_" + attr 76 | 77 | if display_help: 78 | click.echo(" " * 8 + f"{attr} ({formatted_type}):") 79 | click.echo( 80 | "\n".join([f"{' ' * 12}" + desc for desc in metadata]) 81 | ) 82 | 83 | if attr_type in attr_types: 84 | is_flag = attr_type in [bool, Optional[bool]] and not default 85 | 86 | # Only add class-specific options 87 | ctx.command.params.append( 88 | click.Option( 89 | ["--" + class_name_attr, class_name_attr], 90 | type=attr_type, 91 | help=" ".join(metadata), 92 | is_flag=is_flag, 93 | default=None, # This is important, or it sets all the default keys again in config 94 | ) 95 | ) 96 | 97 | if display_help: 98 | ctx.exit() 99 | 100 | super().parse_args(ctx, args) 101 | ``` -------------------------------------------------------------------------------- /marker/processors/text.py: -------------------------------------------------------------------------------- ```python 1 | import math 2 | from typing import Annotated, List 3 | 4 | import regex 5 | 6 | from marker.processors import BaseProcessor 7 | from marker.schema import BlockTypes 8 | from marker.schema.document import Document 9 | from marker.schema.text.line import Line 10 | 11 | 12 | class TextProcessor(BaseProcessor): 13 | """ 14 | A processor for merging text across pages and columns. 15 | """ 16 | 17 | block_types = (BlockTypes.Text, BlockTypes.TextInlineMath) 18 | ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter) 19 | column_gap_ratio: Annotated[ 20 | float, 21 | "The minimum ratio of the page width to the column gap to consider a column break.", 22 | ] = 0.02 23 | 24 | def __init__(self, config): 25 | super().__init__(config) 26 | 27 | def __call__(self, document: Document): 28 | for page in document.pages: 29 | for block in page.contained_blocks(document, self.block_types): 30 | if block.structure is None: 31 | continue 32 | 33 | if not len(block.structure) >= 2: # Skip single lines 34 | continue 35 | 36 | next_block = document.get_next_block(block, self.ignored_block_types) 37 | if next_block is None: # we've reached the end of the document 38 | continue 39 | if next_block.block_type not in self.block_types: 40 | continue # we found a non-text block 41 | if next_block.structure is None: 42 | continue # This is odd though, why do we have text blocks with no structure? 
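# A block is marked as continuing into the next one only when a column or
# page break is detected, the last line looks unfinished (full width or
# hyphenated), and the next block does not open an indented paragraph.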
43 | if next_block.ignore_for_output: 44 | continue # skip ignored blocks 45 | 46 | column_gap = block.polygon.width * self.column_gap_ratio 47 | 48 | column_break, page_break = False, False 49 | next_block_starts_indented = True 50 | next_block_in_first_quadrant = False 51 | last_line_is_full_width = False 52 | last_line_is_hyphentated = False 53 | 54 | if next_block.page_id == block.page_id: # block on the same page 55 | # we check for a column break 56 | column_break = math.floor(next_block.polygon.y_start) <= math.ceil( 57 | block.polygon.y_start 58 | ) and next_block.polygon.x_start > ( 59 | block.polygon.x_end + column_gap 60 | ) 61 | else: 62 | page_break = True 63 | next_page = document.get_page(next_block.page_id) 64 | next_block_in_first_quadrant = ( 65 | next_block.polygon.x_start < next_page.polygon.width // 2 66 | ) and (next_block.polygon.y_start < next_page.polygon.height // 2) 67 | 68 | if not (column_break or page_break): 69 | continue 70 | 71 | new_block_lines = next_block.structure_blocks(document) 72 | 73 | # we check for next_block indentation 74 | if len(new_block_lines): 75 | min_x = math.ceil( 76 | min([line.polygon.x_start for line in new_block_lines]) 77 | ) 78 | next_block_starts_indented = ( 79 | new_block_lines[0].polygon.x_start > min_x 80 | ) 81 | 82 | lines: List[Line] = [ 83 | line 84 | for line in block.structure_blocks(document) 85 | if line.polygon.width > 1 86 | ] 87 | if len(lines): 88 | max_x = math.floor(max([line.polygon.x_end for line in lines])) 89 | last_line_is_full_width = lines[-1].polygon.x_end >= max_x 90 | 91 | last_line_is_hyphentated = regex.compile( 92 | r".*[\p{Ll}|\d][-—¬]\s?$", regex.DOTALL 93 | ).match(lines[-1].raw_text(document).strip()) 94 | 95 | if ( 96 | (last_line_is_full_width or last_line_is_hyphentated) 97 | and not next_block_starts_indented 98 | and ((next_block_in_first_quadrant and page_break) or column_break) 99 | ): 100 | block.has_continuation = True 101 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/__init__.py: -------------------------------------------------------------------------------- ```python 1 | import io 2 | import random 3 | import re 4 | from typing import Tuple 5 | 6 | import markdown2 7 | from PIL import Image 8 | from playwright.sync_api import sync_playwright 9 | 10 | from benchmarks.overall.methods.schema import BenchmarkResult 11 | from marker.renderers.markdown import MarkdownRenderer 12 | 13 | 14 | class BaseMethod: 15 | def __init__(self, **kwargs): 16 | for kwarg in kwargs: 17 | if hasattr(self, kwarg): 18 | setattr(self, kwarg, kwargs[kwarg]) 19 | 20 | @staticmethod 21 | def convert_to_md(html: str): 22 | md = MarkdownRenderer() 23 | markdown = md.md_cls.convert(html) 24 | return markdown 25 | 26 | def __call__(self, sample) -> BenchmarkResult: 27 | raise NotImplementedError() 28 | 29 | def render(self, markdown: str): 30 | return self.html_to_image(self.convert_to_html(markdown)) 31 | 32 | @staticmethod 33 | def convert_to_html(md: str): 34 | block_placeholders = [] 35 | inline_placeholders = [] 36 | 37 | # Add placeholders for the math 38 | def block_sub(match): 39 | content = match.group(1) 40 | placeholder = f"1BLOCKMATH{len(block_placeholders)}1" 41 | block_placeholders.append((placeholder, f"$${content}$$")) 42 | return placeholder 43 | 44 | def inline_sub(match): 45 | content = match.group(1) 46 | placeholder = f"1INLINEMATH{len(inline_placeholders)}1" 47 | inline_placeholders.append((placeholder, f"${content}$")) 48 | 
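# Placeholders shield $...$ and $$...$$ spans from markdown2; they are
# swapped back in afterwards so KaTeX can render the math in the browser.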
return placeholder 49 | 50 | md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL) 51 | md = re.sub(r'\$(.*?)\$', inline_sub, md) 52 | 53 | html = markdown2.markdown(md, extras=['tables']) 54 | 55 | # Replace placeholders 56 | for placeholder, math_str in block_placeholders: 57 | html = html.replace(placeholder, math_str) 58 | for placeholder, math_str in inline_placeholders: 59 | html = html.replace(placeholder, math_str) 60 | 61 | return html 62 | 63 | def html_to_image(self, html: str) -> Image.Image: 64 | with sync_playwright() as p: 65 | browser = p.chromium.launch() 66 | page = browser.new_page() 67 | html_str = f""" 68 | <!DOCTYPE html> 69 | <html> 70 | <head> 71 | <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous"> 72 | <!-- The loading of KaTeX is deferred to speed up page rendering --> 73 | <script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script> 74 | <!-- To automatically render math in text elements, include the auto-render extension: --> 75 | <script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script> 76 | </head> 77 | <body> 78 | {html} 79 | <script> 80 | document.addEventListener("DOMContentLoaded", function() {{ 81 | renderMathInElement(document.body, {{ 82 | delimiters: [ 83 | {{left: '$$', right: '$$', display: true}}, 84 | {{left: '$', right: '$', display: false}} 85 | ], 86 | throwOnError : false 87 | }}); 88 | }}); 89 | </script> 90 | </body> 91 | </html> 92 | """.strip() 93 | page.set_viewport_size({"width": 1200, "height": 800}) 94 | page.set_content(html_str) 95 | page.wait_for_load_state("domcontentloaded") 96 | page.wait_for_timeout(500) # Wait for KaTeX to render 97 | screenshot_bytes = page.screenshot(full_page=True) 98 | browser.close() 99 | 100 | return Image.open(io.BytesIO(screenshot_bytes)) ``` -------------------------------------------------------------------------------- /marker/processors/llm/llm_form.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | from pydantic import BaseModel 4 | 5 | from marker.output import json_to_html 6 | from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData 7 | 8 | from marker.schema import BlockTypes 9 | from marker.schema.document import Document 10 | 11 | 12 | class LLMFormProcessor(BaseLLMSimpleBlockProcessor): 13 | block_types = (BlockTypes.Form,) 14 | form_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images. 15 | You will receive an image of a text block and an html representation of the form in the image. 16 | Your task is to correct any errors in the html representation, and format it properly. 17 | Values and labels should appear in html tables, with the labels on the left side, and values on the right. Other text in the form can appear between the tables. Only use the tags `table, p, span, i, b, th, td, tr, and div`. Do not omit any text from the form - make sure everything is included in the html representation. It should be as faithful to the original form as possible. 
18 | **Instructions:**
19 | 1. Carefully examine the provided form block image.
20 | 2. Analyze the html representation of the form.
21 | 3. Compare the html representation to the image.
22 | 4. If the html representation is correct, or you cannot read the image properly, then write "No corrections needed."
23 | 5. If the html representation contains errors, generate the corrected html representation.
24 | 6. Output only either the corrected html representation or "No corrections needed."
25 | **Example:**
26 | Input:
27 | ```html
28 | <table>
29 | <tr>
30 | <td>Label 1</td>
31 | <td>Label 2</td>
32 | <td>Label 3</td>
33 | </tr>
34 | <tr>
35 | <td>Value 1</td>
36 | <td>Value 2</td>
37 | <td>Value 3</td>
38 | </tr>
39 | </table>
40 | ```
41 | Output:
42 | Comparison: The html representation has the labels in the first row and the values in the second row. It should be corrected to have the labels on the left side and the values on the right side.
43 | ```html
44 | <table>
45 | <tr>
46 | <td>Label 1</td>
47 | <td>Value 1</td>
48 | </tr>
49 | <tr>
50 | <td>Label 2</td>
51 | <td>Value 2</td>
52 | </tr>
53 | <tr>
54 | <td>Label 3</td>
55 | <td>Value 3</td>
56 | </tr>
57 | </table>
58 | ```
59 | **Input:**
60 | ```html
61 | {block_html}
62 | ```
63 | """
64 |
65 |     def inference_blocks(self, document: Document) -> List[BlockData]:
66 |         blocks = super().inference_blocks(document)
67 |         out_blocks = []
68 |         for block_data in blocks:
69 |             block = block_data["block"]
70 |             children = block.contained_blocks(document, (BlockTypes.TableCell,))
71 |             if not children:
72 |                 continue
73 |             out_blocks.append(block_data)
74 |         return out_blocks
75 |
76 |
77 |     def block_prompts(self, document: Document) -> List[PromptData]:
78 |         prompt_data = []
79 |         for block_data in self.inference_blocks(document):
80 |             block = block_data["block"]
81 |             block_html = json_to_html(block.render(document))
82 |             prompt = self.form_rewriting_prompt.replace("{block_html}", block_html)
83 |             image = self.extract_image(document, block)
84 |             prompt_data.append({
85 |                 "prompt": prompt,
86 |                 "image": image,
87 |                 "block": block,
88 |                 "schema": FormSchema,
89 |                 "page": block_data["page"]
90 |             })
91 |         return prompt_data
92 |
93 |
94 |     def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
95 |         block = prompt_data["block"]
96 |         block_html = json_to_html(block.render(document))
97 |
98 |         if not response or "corrected_html" not in response:
99 |             block.update_metadata(llm_error_count=1)
100 |             return
101 |
102 |         corrected_html = response["corrected_html"]
103 |
104 |         # The original table is okay
105 |         if "no corrections needed" in corrected_html.lower():
106 |             return
107 |
108 |         # Potentially a partial response
109 |         if len(corrected_html) < len(block_html) * .33:
110 |             block.update_metadata(llm_error_count=1)
111 |             return
112 |
113 |         corrected_html = corrected_html.strip().removeprefix("```html").removesuffix("```").strip() # removeprefix/removesuffix drop the fence markers; lstrip/rstrip would strip character sets
114 |         block.html = corrected_html
115 |
116 | class FormSchema(BaseModel):
117 |     comparison: str
118 |     corrected_html: str
```
--------------------------------------------------------------------------------
/marker/renderers/ocr_json.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated, List, Tuple
2 |
3 | from pydantic import BaseModel
4 |
5 | from marker.renderers import BaseRenderer
6 | from marker.schema import BlockTypes
7 | from marker.schema.document import Document
8 |
9 |
10 | class OCRJSONCharOutput(BaseModel):
11 |     id: str
12 
| block_type: str 13 | text: str 14 | polygon: List[List[float]] 15 | bbox: List[float] 16 | 17 | 18 | class OCRJSONLineOutput(BaseModel): 19 | id: str 20 | block_type: str 21 | html: str 22 | polygon: List[List[float]] 23 | bbox: List[float] 24 | children: List["OCRJSONCharOutput"] | None = None 25 | 26 | 27 | class OCRJSONPageOutput(BaseModel): 28 | id: str 29 | block_type: str 30 | polygon: List[List[float]] 31 | bbox: List[float] 32 | children: List[OCRJSONLineOutput] | None = None 33 | 34 | 35 | class OCRJSONOutput(BaseModel): 36 | children: List[OCRJSONPageOutput] 37 | block_type: str = str(BlockTypes.Document) 38 | metadata: dict | None = None 39 | 40 | 41 | class OCRJSONRenderer(BaseRenderer): 42 | """ 43 | A renderer for OCR JSON output. 44 | """ 45 | 46 | image_blocks: Annotated[ 47 | Tuple[BlockTypes], 48 | "The list of block types to consider as images.", 49 | ] = (BlockTypes.Picture, BlockTypes.Figure) 50 | page_blocks: Annotated[ 51 | Tuple[BlockTypes], 52 | "The list of block types to consider as pages.", 53 | ] = (BlockTypes.Page,) 54 | 55 | def extract_json(self, document: Document) -> List[OCRJSONPageOutput]: 56 | pages = [] 57 | for page in document.pages: 58 | page_equations = [ 59 | b for b in page.children if b.block_type == BlockTypes.Equation 60 | and not b.removed 61 | ] 62 | equation_lines = [] 63 | for equation in page_equations: 64 | if not equation.structure: 65 | continue 66 | 67 | equation_lines += [ 68 | line 69 | for line in equation.structure 70 | if line.block_type == BlockTypes.Line 71 | ] 72 | 73 | page_lines = [ 74 | block 75 | for block in page.children 76 | if block.block_type == BlockTypes.Line 77 | and block.id not in equation_lines 78 | and not block.removed 79 | ] 80 | 81 | lines = [] 82 | for line in page_lines + page_equations: 83 | line_obj = OCRJSONLineOutput( 84 | id=str(line.id), 85 | block_type=str(line.block_type), 86 | html="", 87 | polygon=line.polygon.polygon, 88 | bbox=line.polygon.bbox, 89 | ) 90 | if line in page_equations: 91 | line_obj.html = line.html 92 | else: 93 | line_obj.html = line.formatted_text(document) 94 | spans = ( 95 | [document.get_block(span_id) for span_id in line.structure] 96 | if line.structure 97 | else [] 98 | ) 99 | children = [] 100 | for span in spans: 101 | if not span.structure: 102 | continue 103 | 104 | span_chars = [ 105 | document.get_block(char_id) for char_id in span.structure 106 | ] 107 | children.extend( 108 | [ 109 | OCRJSONCharOutput( 110 | id=str(char.id), 111 | block_type=str(char.block_type), 112 | text=char.text, 113 | polygon=char.polygon.polygon, 114 | bbox=char.polygon.bbox, 115 | ) 116 | for char in span_chars 117 | ] 118 | ) 119 | line_obj.children = children 120 | lines.append(line_obj) 121 | 122 | page = OCRJSONPageOutput( 123 | id=str(page.id), 124 | block_type=str(page.block_type), 125 | polygon=page.polygon.polygon, 126 | bbox=page.polygon.bbox, 127 | children=lines, 128 | ) 129 | pages.append(page) 130 | 131 | return pages 132 | 133 | def __call__(self, document: Document) -> OCRJSONOutput: 134 | return OCRJSONOutput(children=self.extract_json(document), metadata=None) 135 | ``` -------------------------------------------------------------------------------- /marker/services/openai.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import time 3 | from typing import Annotated, List 4 | 5 | import openai 6 | import PIL 7 | from marker.logger import get_logger 8 | from openai import APITimeoutError, RateLimitError 9 | 
from PIL import Image
10 | from pydantic import BaseModel
11 |
12 | from marker.schema.blocks import Block
13 | from marker.services import BaseService
14 |
15 | logger = get_logger()
16 |
17 |
18 | class OpenAIService(BaseService):
19 |     openai_base_url: Annotated[
20 |         str, "The base url to use for OpenAI-like models. No trailing slash."
21 |     ] = "https://api.openai.com/v1"
22 |     openai_model: Annotated[str, "The model name to use for OpenAI-like model."] = (
23 |         "gpt-4o-mini"
24 |     )
25 |     openai_api_key: Annotated[
26 |         str, "The API key to use for the OpenAI-like service."
27 |     ] = None
28 |     openai_image_format: Annotated[
29 |         str,
30 |         "The image format to use for the OpenAI-like service. Use 'png' for better compatibility",
31 |     ] = "webp"
32 |
33 |     def process_images(self, images: List[Image.Image]) -> List[dict]:
34 |         """
35 |         Generate the base-64 encoded message to send to an
36 |         OpenAI-compatible multimodal model.
37 |
38 |         Args:
39 |             images: Image or list of PIL images to include.
40 |                 The image format is taken from self.openai_image_format; "png" offers better compatibility.
41 |
42 |         Returns:
43 |             A list of OpenAI-compatible multimodal messages containing the base64-encoded images.
44 |         """
45 |         if isinstance(images, Image.Image):
46 |             images = [images]
47 |
48 |         img_fmt = self.openai_image_format
49 |         return [
50 |             {
51 |                 "type": "image_url",
52 |                 "image_url": {
53 |                     "url": "data:image/{};base64,{}".format(
54 |                         img_fmt, self.img_to_base64(img, format=img_fmt)
55 |                     ),
56 |                 },
57 |             }
58 |             for img in images
59 |         ]
60 |
61 |     def __call__(
62 |         self,
63 |         prompt: str,
64 |         image: PIL.Image.Image | List[PIL.Image.Image] | None,
65 |         block: Block | None,
66 |         response_schema: type[BaseModel],
67 |         max_retries: int | None = None,
68 |         timeout: int | None = None,
69 |     ):
70 |         if max_retries is None:
71 |             max_retries = self.max_retries
72 |
73 |         if timeout is None:
74 |             timeout = self.timeout
75 |
76 |         client = self.get_client()
77 |         image_data = self.format_image_for_llm(image)
78 |
79 |         messages = [
80 |             {
81 |                 "role": "user",
82 |                 "content": [
83 |                     *image_data,
84 |                     {"type": "text", "text": prompt},
85 |                 ],
86 |             }
87 |         ]
88 |
89 |         total_tries = max_retries + 1
90 |         for tries in range(1, total_tries + 1):
91 |             try:
92 |                 response = client.beta.chat.completions.parse(
93 |                     extra_headers={
94 |                         "X-Title": "Marker",
95 |                         "HTTP-Referer": "https://github.com/datalab-to/marker",
96 |                     },
97 |                     model=self.openai_model,
98 |                     messages=messages,
99 |                     timeout=timeout,
100 |                     response_format=response_schema,
101 |                 )
102 |                 response_text = response.choices[0].message.content
103 |                 total_tokens = response.usage.total_tokens
104 |                 if block:
105 |                     block.update_metadata(
106 |                         llm_tokens_used=total_tokens, llm_request_count=1
107 |                     )
108 |                 return json.loads(response_text)
109 |             except (APITimeoutError, RateLimitError) as e:
110 |                 # Rate limit exceeded
111 |                 if tries == total_tries:
112 |                     # Last attempt failed. Give up
113 |                     logger.error(
114 |                         f"Rate limit error: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
115 |                     )
116 |                     break
117 |                 else:
118 |                     wait_time = tries * self.retry_wait_time
119 |                     logger.warning(
120 |                         f"Rate limit error: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})",
121 |                     )
122 |                     time.sleep(wait_time)
123 |             except Exception as e:
124 |                 logger.error(f"OpenAI inference failed: {e}")
125 |                 break
126 |
127 |         return {}
128 |
129 |     def get_client(self) -> openai.OpenAI:
130 |         return openai.OpenAI(api_key=self.openai_api_key, base_url=self.openai_base_url)
131 |
```
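A minimal usage sketch of `OpenAIService`, assuming it accepts a config dict the way marker's other configurable classes do; the model name, API key, and image path are placeholders, and `Caption` is a hypothetical response schema:

```python
from PIL import Image
from pydantic import BaseModel

from marker.services.openai import OpenAIService


class Caption(BaseModel):
    text: str


# Placeholder config values; supply a real API key in practice.
service = OpenAIService(
    {
        "openai_model": "gpt-4o-mini",
        "openai_api_key": "sk-placeholder",
        "openai_image_format": "png",  # png for maximum compatibility
    }
)

# __call__ returns the parsed JSON dict on success, or {} once retries are exhausted.
result = service(
    prompt="Describe this image block in one sentence.",
    image=Image.open("block.png"),
    block=None,  # no block metadata to update in this standalone sketch
    response_schema=Caption,
)
print(result.get("text", ""))
```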