This is page 2 of 7. Use http://codebase.md/datalab-to/marker?page={x} to view the full context.
# Directory Structure
```
├── .github
│ ├── ISSUE_TEMPLATE
│ │ ├── breaking-bug-report.md
│ │ ├── feature_request.md
│ │ └── output-bug-report.md
│ └── workflows
│ ├── benchmarks.yml
│ ├── ci.yml
│ ├── cla.yml
│ ├── publish.yml
│ └── scripts.yml
├── .gitignore
├── .pre-commit-config.yaml
├── benchmarks
│ ├── __init__.py
│ ├── overall
│ │ ├── __init__.py
│ │ ├── display
│ │ │ ├── __init__.py
│ │ │ ├── dataset.py
│ │ │ └── table.py
│ │ ├── download
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── llamaparse.py
│ │ │ ├── main.py
│ │ │ ├── mathpix.py
│ │ │ └── mistral.py
│ │ ├── elo.py
│ │ ├── methods
│ │ │ ├── __init__.py
│ │ │ ├── docling.py
│ │ │ ├── gt.py
│ │ │ ├── llamaparse.py
│ │ │ ├── marker.py
│ │ │ ├── mathpix.py
│ │ │ ├── mistral.py
│ │ │ ├── olmocr.py
│ │ │ └── schema.py
│ │ ├── overall.py
│ │ ├── registry.py
│ │ ├── schema.py
│ │ └── scorers
│ │ ├── __init__.py
│ │ ├── clean.py
│ │ ├── heuristic.py
│ │ ├── llm.py
│ │ └── schema.py
│ ├── table
│ │ ├── __init__.py
│ │ ├── gemini.py
│ │ ├── inference.py
│ │ ├── scoring.py
│ │ └── table.py
│ ├── throughput
│ │ ├── __init__.py
│ │ └── main.py
│ └── verify_scores.py
├── chunk_convert.py
├── CLA.md
├── convert_single.py
├── convert.py
├── data
│ ├── .gitignore
│ ├── examples
│ │ ├── json
│ │ │ ├── multicolcnn.json
│ │ │ ├── switch_trans.json
│ │ │ └── thinkpython.json
│ │ └── markdown
│ │ ├── multicolcnn
│ │ │ ├── _page_1_Figure_0.jpeg
│ │ │ ├── _page_2_Picture_0.jpeg
│ │ │ ├── _page_6_Figure_0.jpeg
│ │ │ ├── _page_7_Figure_0.jpeg
│ │ │ ├── multicolcnn_meta.json
│ │ │ └── multicolcnn.md
│ │ ├── switch_transformers
│ │ │ ├── _page_11_Figure_4.jpeg
│ │ │ ├── _page_12_Figure_4.jpeg
│ │ │ ├── _page_13_Figure_2.jpeg
│ │ │ ├── _page_18_Figure_1.jpeg
│ │ │ ├── _page_18_Figure_3.jpeg
│ │ │ ├── _page_2_Figure_3.jpeg
│ │ │ ├── _page_20_Figure_1.jpeg
│ │ │ ├── _page_20_Figure_4.jpeg
│ │ │ ├── _page_27_Figure_1.jpeg
│ │ │ ├── _page_29_Figure_1.jpeg
│ │ │ ├── _page_30_Figure_1.jpeg
│ │ │ ├── _page_31_Figure_3.jpeg
│ │ │ ├── _page_4_Figure_1.jpeg
│ │ │ ├── _page_5_Figure_3.jpeg
│ │ │ ├── switch_trans_meta.json
│ │ │ └── switch_trans.md
│ │ └── thinkpython
│ │ ├── _page_109_Figure_1.jpeg
│ │ ├── _page_115_Figure_1.jpeg
│ │ ├── _page_116_Figure_3.jpeg
│ │ ├── _page_127_Figure_1.jpeg
│ │ ├── _page_128_Figure_1.jpeg
│ │ ├── _page_167_Figure_1.jpeg
│ │ ├── _page_169_Figure_1.jpeg
│ │ ├── _page_173_Figure_1.jpeg
│ │ ├── _page_190_Figure_1.jpeg
│ │ ├── _page_195_Figure_1.jpeg
│ │ ├── _page_205_Figure_1.jpeg
│ │ ├── _page_23_Figure_1.jpeg
│ │ ├── _page_23_Figure_3.jpeg
│ │ ├── _page_230_Figure_1.jpeg
│ │ ├── _page_233_Figure_1.jpeg
│ │ ├── _page_233_Figure_3.jpeg
│ │ ├── _page_234_Figure_1.jpeg
│ │ ├── _page_235_Figure_1.jpeg
│ │ ├── _page_236_Figure_1.jpeg
│ │ ├── _page_236_Figure_3.jpeg
│ │ ├── _page_237_Figure_1.jpeg
│ │ ├── _page_238_Figure_1.jpeg
│ │ ├── _page_46_Figure_1.jpeg
│ │ ├── _page_60_Figure_1.jpeg
│ │ ├── _page_60_Figure_3.jpeg
│ │ ├── _page_67_Figure_1.jpeg
│ │ ├── _page_71_Figure_1.jpeg
│ │ ├── _page_78_Figure_1.jpeg
│ │ ├── _page_85_Figure_1.jpeg
│ │ ├── _page_94_Figure_1.jpeg
│ │ ├── _page_99_Figure_17.jpeg
│ │ ├── _page_99_Figure_178.jpeg
│ │ ├── thinkpython_meta.json
│ │ └── thinkpython.md
│ ├── images
│ │ ├── overall.png
│ │ ├── per_doc.png
│ │ └── table.png
│ └── latex_to_md.sh
├── examples
│ ├── marker_modal_deployment.py
│ └── README.md
├── extraction_app.py
├── LICENSE
├── marker
│ ├── builders
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── layout.py
│ │ ├── line.py
│ │ ├── ocr.py
│ │ └── structure.py
│ ├── config
│ │ ├── __init__.py
│ │ ├── crawler.py
│ │ ├── parser.py
│ │ └── printer.py
│ ├── converters
│ │ ├── __init__.py
│ │ ├── extraction.py
│ │ ├── ocr.py
│ │ ├── pdf.py
│ │ └── table.py
│ ├── extractors
│ │ ├── __init__.py
│ │ ├── document.py
│ │ └── page.py
│ ├── logger.py
│ ├── models.py
│ ├── output.py
│ ├── processors
│ │ ├── __init__.py
│ │ ├── blank_page.py
│ │ ├── block_relabel.py
│ │ ├── blockquote.py
│ │ ├── code.py
│ │ ├── debug.py
│ │ ├── document_toc.py
│ │ ├── equation.py
│ │ ├── footnote.py
│ │ ├── ignoretext.py
│ │ ├── line_merge.py
│ │ ├── line_numbers.py
│ │ ├── list.py
│ │ ├── llm
│ │ │ ├── __init__.py
│ │ │ ├── llm_complex.py
│ │ │ ├── llm_equation.py
│ │ │ ├── llm_form.py
│ │ │ ├── llm_handwriting.py
│ │ │ ├── llm_image_description.py
│ │ │ ├── llm_mathblock.py
│ │ │ ├── llm_meta.py
│ │ │ ├── llm_page_correction.py
│ │ │ ├── llm_sectionheader.py
│ │ │ ├── llm_table_merge.py
│ │ │ └── llm_table.py
│ │ ├── order.py
│ │ ├── page_header.py
│ │ ├── reference.py
│ │ ├── sectionheader.py
│ │ ├── table.py
│ │ ├── text.py
│ │ └── util.py
│ ├── providers
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── epub.py
│ │ ├── html.py
│ │ ├── image.py
│ │ ├── pdf.py
│ │ ├── powerpoint.py
│ │ ├── registry.py
│ │ ├── spreadsheet.py
│ │ └── utils.py
│ ├── renderers
│ │ ├── __init__.py
│ │ ├── chunk.py
│ │ ├── extraction.py
│ │ ├── html.py
│ │ ├── json.py
│ │ ├── markdown.py
│ │ └── ocr_json.py
│ ├── schema
│ │ ├── __init__.py
│ │ ├── blocks
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── basetable.py
│ │ │ ├── caption.py
│ │ │ ├── code.py
│ │ │ ├── complexregion.py
│ │ │ ├── equation.py
│ │ │ ├── figure.py
│ │ │ ├── footnote.py
│ │ │ ├── form.py
│ │ │ ├── handwriting.py
│ │ │ ├── inlinemath.py
│ │ │ ├── listitem.py
│ │ │ ├── pagefooter.py
│ │ │ ├── pageheader.py
│ │ │ ├── picture.py
│ │ │ ├── reference.py
│ │ │ ├── sectionheader.py
│ │ │ ├── table.py
│ │ │ ├── tablecell.py
│ │ │ ├── text.py
│ │ │ └── toc.py
│ │ ├── document.py
│ │ ├── groups
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── figure.py
│ │ │ ├── list.py
│ │ │ ├── page.py
│ │ │ ├── picture.py
│ │ │ └── table.py
│ │ ├── polygon.py
│ │ ├── registry.py
│ │ └── text
│ │ ├── __init__.py
│ │ ├── char.py
│ │ ├── line.py
│ │ └── span.py
│ ├── scripts
│ │ ├── __init__.py
│ │ ├── chunk_convert.py
│ │ ├── chunk_convert.sh
│ │ ├── common.py
│ │ ├── convert_single.py
│ │ ├── convert.py
│ │ ├── extraction_app.py
│ │ ├── file_to_s3.py
│ │ ├── run_streamlit_app.py
│ │ ├── server.py
│ │ └── streamlit_app.py
│ ├── services
│ │ ├── __init__.py
│ │ ├── azure_openai.py
│ │ ├── claude.py
│ │ ├── gemini.py
│ │ ├── ollama.py
│ │ ├── openai.py
│ │ └── vertex.py
│ ├── settings.py
│ ├── util.py
│ └── utils
│ ├── __init__.py
│ ├── batch.py
│ ├── gpu.py
│ └── image.py
├── marker_app.py
├── marker_server.py
├── MODEL_LICENSE
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── README.md
├── signatures
│ └── version1
│ └── cla.json
├── static
│ └── fonts
│ └── .gitignore
└── tests
├── builders
│ ├── test_blank_page.py
│ ├── test_document_builder.py
│ ├── test_garbled_pdf.py
│ ├── test_layout_replace.py
│ ├── test_ocr_builder.py
│ ├── test_ocr_pipeline.py
│ ├── test_overriding.py
│ ├── test_pdf_links.py
│ ├── test_rotated_bboxes.py
│ ├── test_strip_existing_ocr.py
│ └── test_structure.py
├── config
│ └── test_config.py
├── conftest.py
├── converters
│ ├── test_extraction_converter.py
│ ├── test_ocr_converter.py
│ ├── test_pdf_converter.py
│ └── test_table_converter.py
├── processors
│ ├── test_document_toc_processor.py
│ ├── test_equation_processor.py
│ ├── test_footnote_processor.py
│ ├── test_ignoretext.py
│ ├── test_llm_processors.py
│ ├── test_table_merge.py
│ └── test_table_processor.py
├── providers
│ ├── test_document_providers.py
│ ├── test_image_provider.py
│ └── test_pdf_provider.py
├── renderers
│ ├── test_chunk_renderer.py
│ ├── test_extract_images.py
│ ├── test_html_renderer.py
│ ├── test_json_renderer.py
│ └── test_markdown_renderer.py
├── schema
│ └── groups
│ └── test_list_grouping.py
├── services
│ └── test_service_init.py
└── utils.py
```
# Files
--------------------------------------------------------------------------------
/marker/providers/epub.py:
--------------------------------------------------------------------------------
```python
import base64
import os
import tempfile
from bs4 import BeautifulSoup
from marker.providers.pdf import PdfProvider
css = '''
@page {
size: A4;
margin: 2cm;
}
img {
max-width: 100%;
max-height: 25cm;
object-fit: contain;
margin: 12pt auto;
}
div, p {
max-width: 100%;
word-break: break-word;
font-size: 10pt;
}
table {
width: 100%;
border-collapse: collapse;
break-inside: auto;
font-size: 10pt;
}
tr {
break-inside: avoid;
page-break-inside: avoid;
}
td {
border: 0.75pt solid #000;
padding: 6pt;
}
'''
class EpubProvider(PdfProvider):
def __init__(self, filepath: str, config=None):
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=f".pdf")
self.temp_pdf_path = temp_pdf.name
temp_pdf.close()
# Convert Epub to PDF
try:
self.convert_epub_to_pdf(filepath)
except Exception as e:
raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}")
# Initialize the PDF provider with the temp pdf path
super().__init__(self.temp_pdf_path, config)
def __del__(self):
if os.path.exists(self.temp_pdf_path):
os.remove(self.temp_pdf_path)
def convert_epub_to_pdf(self, filepath):
from weasyprint import CSS, HTML
from ebooklib import epub
import ebooklib
ebook = epub.read_epub(filepath)
styles = []
html_content = ""
img_tags = {}
for item in ebook.get_items():
if item.get_type() == ebooklib.ITEM_IMAGE:
img_data = base64.b64encode(item.get_content()).decode("utf-8")
img_tags[item.file_name] = f'data:{item.media_type};base64,{img_data}'
elif item.get_type() == ebooklib.ITEM_STYLE:
styles.append(item.get_content().decode('utf-8'))
for item in ebook.get_items():
if item.get_type() == ebooklib.ITEM_DOCUMENT:
html_content += item.get_content().decode("utf-8")
soup = BeautifulSoup(html_content, 'html.parser')
for img in soup.find_all('img'):
src = img.get('src')
if src:
normalized_src = src.replace('../', '')
if normalized_src in img_tags:
img['src'] = img_tags[normalized_src]
for image in soup.find_all('image'):
src = image.get('xlink:href')
if src:
normalized_src = src.replace('../', '')
if normalized_src in img_tags:
image['xlink:href'] = img_tags[normalized_src]
html_content = str(soup)
full_style = ''.join([css]) # + styles)
# Convert the assembled HTML into the temporary PDF
HTML(string=html_content, base_url=filepath).write_pdf(
self.temp_pdf_path,
stylesheets=[CSS(string=full_style), self.get_font_css()]
)
```
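A minimal usage sketch (the EPUB path is a placeholder, and weasyprint/ebooklib must be installed): constructing the provider converts the book to a temporary PDF, after which it behaves exactly like a `PdfProvider`.
```python
# Hypothetical usage; "book.epub" is a placeholder path.
from marker.providers.epub import EpubProvider

provider = EpubProvider("book.epub", config=None)
print(provider.temp_pdf_path)  # intermediate PDF that the PdfProvider machinery reads
```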
--------------------------------------------------------------------------------
/tests/processors/test_table_processor.py:
--------------------------------------------------------------------------------
```python
from typing import List
import pytest
from marker.renderers.markdown import MarkdownRenderer
from marker.schema import BlockTypes
from marker.processors.table import TableProcessor
from marker.schema.blocks import TableCell
@pytest.mark.config({"page_range": [5]})
def test_table_processor(
pdf_document, recognition_model, table_rec_model, detection_model
):
processor = TableProcessor(recognition_model, table_rec_model, detection_model)
processor(pdf_document)
for block in pdf_document.pages[0].children:
if block.block_type == BlockTypes.Table:
children = block.contained_blocks(pdf_document, (BlockTypes.TableCell,))
assert children
assert len(children) > 0
assert isinstance(children[0], TableCell)
assert len(pdf_document.contained_blocks((BlockTypes.Table,))) == 2
renderer = MarkdownRenderer()
table_output = renderer(pdf_document)
assert "Schedule" in table_output.markdown
@pytest.mark.filename("table_ex.pdf")
@pytest.mark.config({"page_range": [0], "force_ocr": True})
def test_avoid_double_ocr(
pdf_document, recognition_model, table_rec_model, detection_model
):
tables = pdf_document.contained_blocks((BlockTypes.Table,))
lines = tables[0].contained_blocks(pdf_document, (BlockTypes.Line,))
assert len(lines) == 0
processor = TableProcessor(
recognition_model, table_rec_model, detection_model, config={"force_ocr": True}
)
processor(pdf_document)
renderer = MarkdownRenderer()
table_output = renderer(pdf_document)
assert "Participants" in table_output.markdown
@pytest.mark.filename("multicol-blocks.pdf")
@pytest.mark.config({"page_range": [3]})
def test_overlap_blocks(
pdf_document, detection_model, recognition_model, table_rec_model
):
page = pdf_document.pages[0]
assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text(
pdf_document
)
processor = TableProcessor(recognition_model, table_rec_model, detection_model)
processor(pdf_document)
assert "Cascading, and the Auxiliary Problem Principle" in page.raw_text(
pdf_document
)
@pytest.mark.filename("pres.pdf")
@pytest.mark.config({"page_range": [4]})
def test_ocr_table(pdf_document, recognition_model, table_rec_model, detection_model):
processor = TableProcessor(recognition_model, table_rec_model, detection_model)
processor(pdf_document)
renderer = MarkdownRenderer()
table_output = renderer(pdf_document)
assert "1.2E-38" in table_output.markdown
@pytest.mark.config({"page_range": [11]})
def test_split_rows(pdf_document, recognition_model, table_rec_model, detection_model):
processor = TableProcessor(recognition_model, table_rec_model, detection_model)
processor(pdf_document)
table = pdf_document.contained_blocks((BlockTypes.Table,))[-1]
cells: List[TableCell] = table.contained_blocks(
pdf_document, (BlockTypes.TableCell,)
)
unique_rows = len(set([cell.row_id for cell in cells]))
assert unique_rows == 6
```
--------------------------------------------------------------------------------
/marker/renderers/json.py:
--------------------------------------------------------------------------------
```python
from typing import Annotated, Dict, List, Tuple
from pydantic import BaseModel
from marker.renderers import BaseRenderer
from marker.schema import BlockTypes
from marker.schema.blocks import Block, BlockOutput
from marker.schema.document import Document
from marker.schema.registry import get_block_class
class JSONBlockOutput(BaseModel):
id: str
block_type: str
html: str
polygon: List[List[float]]
bbox: List[float]
children: List["JSONBlockOutput"] | None = None
section_hierarchy: Dict[int, str] | None = None
images: dict | None = None
class JSONOutput(BaseModel):
children: List[JSONBlockOutput]
block_type: str = str(BlockTypes.Document)
metadata: dict
def reformat_section_hierarchy(section_hierarchy):
new_section_hierarchy = {}
for key, value in section_hierarchy.items():
new_section_hierarchy[key] = str(value)
return new_section_hierarchy
class JSONRenderer(BaseRenderer):
"""
A renderer for JSON output.
"""
image_blocks: Annotated[
Tuple[BlockTypes],
"The list of block types to consider as images.",
] = (BlockTypes.Picture, BlockTypes.Figure)
page_blocks: Annotated[
Tuple[BlockTypes],
"The list of block types to consider as pages.",
] = (BlockTypes.Page,)
def extract_json(self, document: Document, block_output: BlockOutput):
cls = get_block_class(block_output.id.block_type)
if cls.__base__ == Block:
html, images = self.extract_block_html(document, block_output)
return JSONBlockOutput(
html=html,
polygon=block_output.polygon.polygon,
bbox=block_output.polygon.bbox,
id=str(block_output.id),
block_type=str(block_output.id.block_type),
images=images,
section_hierarchy=reformat_section_hierarchy(
block_output.section_hierarchy
),
)
else:
children = []
for child in block_output.children:
child_output = self.extract_json(document, child)
children.append(child_output)
return JSONBlockOutput(
html=block_output.html,
polygon=block_output.polygon.polygon,
bbox=block_output.polygon.bbox,
id=str(block_output.id),
block_type=str(block_output.id.block_type),
children=children,
section_hierarchy=reformat_section_hierarchy(
block_output.section_hierarchy
),
)
def __call__(self, document: Document) -> JSONOutput:
document_output = document.render(self.block_config)
json_output = []
for page_output in document_output.children:
json_output.append(self.extract_json(document, page_output))
return JSONOutput(
children=json_output,
metadata=self.generate_document_metadata(document, document_output),
)
```
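A short sketch of invoking the renderer, assuming `document` is a marker `Document` that has already been built by the conversion pipeline:
```python
# Sketch only: `document` is assumed to be a fully built marker Document.
from marker.renderers.json import JSONRenderer

renderer = JSONRenderer()
output = renderer(document)          # JSONOutput
print(output.block_type)             # "Document"
for page in output.children:         # one JSONBlockOutput per page block
    print(page.id, len(page.children or []))
```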
--------------------------------------------------------------------------------
/marker/processors/llm/llm_image_description.py:
--------------------------------------------------------------------------------
```python
from pydantic import BaseModel
from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData
from marker.schema import BlockTypes
from marker.schema.document import Document
from typing import Annotated, List
class LLMImageDescriptionProcessor(BaseLLMSimpleBlockProcessor):
block_types = (
BlockTypes.Picture,
BlockTypes.Figure,
)
extract_images: Annotated[bool, "Extract images from the document."] = True
image_description_prompt: Annotated[
str,
"The prompt to use for generating image descriptions.",
"Default is a string containing the Gemini prompt.",
] = """You are a document analysis expert who specializes in creating text descriptions for images.
You will receive an image of a picture or figure. Your job will be to create a short description of the image.
**Instructions:**
1. Carefully examine the provided image.
2. Analyze any text that was extracted from within the image.
3. Output a faithful description of the image. Make sure there is enough specific detail to accurately reconstruct the image. If the image is a figure or contains numeric data, include the numeric data in the output.
**Example:**
Input:
```text
"Fruit Preference Survey"
20, 15, 10
Apples, Bananas, Oranges
```
Output:
In this figure, a bar chart titled "Fruit Preference Survey" is showing the number of people who prefer different types of fruits. The x-axis shows the types of fruits, and the y-axis shows the number of people. The bar chart shows that most people prefer apples, followed by bananas and oranges. 20 people prefer apples, 15 people prefer bananas, and 10 people prefer oranges.
**Input:**
```text
{raw_text}
```
"""
def inference_blocks(self, document: Document) -> List[BlockData]:
blocks = super().inference_blocks(document)
if self.extract_images:
return []
return blocks
def block_prompts(self, document: Document) -> List[PromptData]:
prompt_data = []
for block_data in self.inference_blocks(document):
block = block_data["block"]
prompt = self.image_description_prompt.replace(
"{raw_text}", block.raw_text(document)
)
image = self.extract_image(document, block)
prompt_data.append(
{
"prompt": prompt,
"image": image,
"block": block,
"schema": ImageSchema,
"page": block_data["page"],
}
)
return prompt_data
def rewrite_block(
self, response: dict, prompt_data: PromptData, document: Document
):
block = prompt_data["block"]
if not response or "image_description" not in response:
block.update_metadata(llm_error_count=1)
return
image_description = response["image_description"]
if len(image_description) < 10:
block.update_metadata(llm_error_count=1)
return
block.description = image_description
class ImageSchema(BaseModel):
image_description: str
```
--------------------------------------------------------------------------------
/benchmarks/overall/display/table.py:
--------------------------------------------------------------------------------
```python
from pathlib import Path
from typing import Dict, List
import tabulate
from benchmarks.overall.schema import FullResult
def write_table(title: str, rows: list, headers: list, out_path: Path, filename: str):
table = tabulate.tabulate(rows, headers=headers, tablefmt="github")
with open(out_path / filename, "w", encoding="utf-8") as f:
f.write(f"# {title}\n")
f.write(table)
print(title)
print(table)
def print_scores(result: FullResult, out_path: Path, methods: List[str], score_types: List[str], default_score_type="heuristic", default_method="marker"):
document_types = list(result["averages_by_type"][default_method][default_score_type].keys())
headers = ["Document Type"]
for method in methods:
for score_type in score_types:
headers.append(f"{method} {score_type}")
document_rows = [[k] for k in document_types]
for i, doc_type in enumerate(document_types):
for method in methods:
for score_type in score_types:
avg_score = sum(result["averages_by_type"][method][score_type][doc_type]) / max(1, len(result["averages_by_type"][method][score_type][doc_type]))
document_rows[i].append(avg_score)
write_table("Document Types", document_rows, headers, out_path, "document_types.md")
headers = ["Block Type"]
block_types = list(result["averages_by_block_type"][default_method][default_score_type].keys()) # all possible blocks
block_score_types = list(result["averages_by_block_type"][default_method].keys())
for method in methods:
for score_type in block_score_types:
headers.append(f"{method} {score_type}")
block_rows = [[k] for k in block_types]
for i, block_type in enumerate(block_types):
for method in methods:
for score_type in block_score_types:
avg_score = sum(result["averages_by_block_type"][method][score_type][block_type]) / max(1, len(result["averages_by_block_type"][method][score_type][block_type]))
block_rows[i].append(avg_score)
write_table("Block types", block_rows, headers, out_path, "block_types.md")
headers = ["Method", "Avg Time"] + score_types
inference_rows = [[k] for k in methods]
all_raw_scores = [result["scores"][i] for i in result["scores"]]
for i, method in enumerate(methods):
avg_time = sum(result["average_times"][method]) / max(1, len(result["average_times"][method]))
inference_rows[i].append(avg_time)
for score_type in score_types:
scores_lst = []
for ar in all_raw_scores:
try:
# Sometimes a few llm scores are missing
scores_lst.append(ar[method][score_type]["score"])
except KeyError:
continue
avg_score = sum(scores_lst) / max(1, len(scores_lst))
inference_rows[i].append(avg_score)
write_table("Overall Results", inference_rows, headers, out_path, "overall.md")
print("Scores computed by aligning ground truth markdown blocks with predicted markdown for each method. The scores are 0-100 based on edit distance.")
```
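A toy call to `write_table` with made-up rows, just to show the argument shapes; it prints the github-style table and also writes it under `out_path`:
```python
# Illustrative values only; out_path must be an existing directory.
from pathlib import Path
from benchmarks.overall.display.table import write_table

write_table(
    title="Overall Results",
    rows=[["marker", 92.1], ["mathpix", 88.4]],
    headers=["Method", "heuristic"],
    out_path=Path("."),
    filename="overall.md",
)
```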
--------------------------------------------------------------------------------
/marker/renderers/chunk.py:
--------------------------------------------------------------------------------
```python
import html
from typing import List, Dict
from bs4 import BeautifulSoup
from pydantic import BaseModel
from marker.renderers.json import JSONRenderer, JSONBlockOutput
from marker.schema.document import Document
class FlatBlockOutput(BaseModel):
id: str
block_type: str
html: str
page: int
polygon: List[List[float]]
bbox: List[float]
section_hierarchy: Dict[int, str] | None = None
images: dict | None = None
class ChunkOutput(BaseModel):
blocks: List[FlatBlockOutput]
page_info: Dict[int, dict]
metadata: dict
def collect_images(block: JSONBlockOutput) -> dict[str, str]:
if not getattr(block, "children", None):
return block.images or {}
else:
images = block.images or {}
for child_block in block.children:
images.update(collect_images(child_block))
return images
def assemble_html_with_images(block: JSONBlockOutput, image_blocks: set[str]) -> str:
if not getattr(block, "children", None):
if block.block_type in image_blocks:
return f"<p>{block.html}<img src='{block.id}'></p>"
else:
return block.html
child_html = [assemble_html_with_images(child, image_blocks) for child in block.children]
child_ids = [child.id for child in block.children]
soup = BeautifulSoup(block.html, "html.parser")
content_refs = soup.find_all("content-ref")
for ref in content_refs:
src_id = ref.attrs["src"]
if src_id in child_ids:
ref.replace_with(child_html[child_ids.index(src_id)])
return html.unescape(str(soup))
def json_to_chunks(
block: JSONBlockOutput, image_blocks: set[str], page_id: int=0) -> FlatBlockOutput | List[FlatBlockOutput]:
if block.block_type == "Page":
children = block.children
page_id = int(block.id.split("/")[-1])
return [json_to_chunks(child, image_blocks, page_id=page_id) for child in children]
else:
return FlatBlockOutput(
id=block.id,
block_type=block.block_type,
html=assemble_html_with_images(block, image_blocks),
page=page_id,
polygon=block.polygon,
bbox=block.bbox,
section_hierarchy=block.section_hierarchy,
images=collect_images(block),
)
class ChunkRenderer(JSONRenderer):
def __call__(self, document: Document) -> ChunkOutput:
document_output = document.render(self.block_config)
json_output = []
for page_output in document_output.children:
json_output.append(self.extract_json(document, page_output))
# This will get the top-level blocks from every page
chunk_output = []
for item in json_output:
chunks = json_to_chunks(item, set([str(block) for block in self.image_blocks]))
chunk_output.extend(chunks)
page_info = {
page.page_id: {"bbox": page.polygon.bbox, "polygon": page.polygon.polygon}
for page in document.pages
}
return ChunkOutput(
blocks=chunk_output,
page_info=page_info,
metadata=self.generate_document_metadata(document, document_output),
)
```
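A minimal sketch of the chunk renderer, assuming `document` is a built marker `Document`; every page is flattened into top-level blocks with page ids and geometry:
```python
# Sketch only: `document` is assumed to be a fully built marker Document.
from marker.renderers.chunk import ChunkRenderer

renderer = ChunkRenderer()
chunks = renderer(document)                      # ChunkOutput
block = chunks.blocks[0]
print(block.page, block.block_type, block.bbox)  # flat, per-block chunk info
print(chunks.page_info[block.page]["bbox"])      # page-level geometry
```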
--------------------------------------------------------------------------------
/marker/schema/document.py:
--------------------------------------------------------------------------------
```python
from __future__ import annotations
from typing import List, Sequence, Optional
from pydantic import BaseModel
from marker.schema import BlockTypes
from marker.schema.blocks import Block, BlockId, BlockOutput
from marker.schema.groups.page import PageGroup
class DocumentOutput(BaseModel):
children: List[BlockOutput]
html: str
block_type: BlockTypes = BlockTypes.Document
class TocItem(BaseModel):
title: str
heading_level: int
page_id: int
polygon: List[List[float]]
class Document(BaseModel):
filepath: str
pages: List[PageGroup]
block_type: BlockTypes = BlockTypes.Document
table_of_contents: List[TocItem] | None = None
debug_data_path: str | None = None # Path that debug data was saved to
def get_block(self, block_id: BlockId):
page = self.get_page(block_id.page_id)
block = page.get_block(block_id)
if block:
return block
return None
def get_page(self, page_id):
for page in self.pages:
if page.page_id == page_id:
return page
return None
def get_next_block(
self, block: Block, ignored_block_types: List[BlockTypes] = None
):
if ignored_block_types is None:
ignored_block_types = []
next_block = None
# Try to find the next block in the current page
page = self.get_page(block.page_id)
next_block = page.get_next_block(block, ignored_block_types)
if next_block:
return next_block
# If no block found, search subsequent pages
for page in self.pages[self.pages.index(page) + 1 :]:
next_block = page.get_next_block(None, ignored_block_types)
if next_block:
return next_block
return None
def get_next_page(self, page: PageGroup):
page_idx = self.pages.index(page)
if page_idx + 1 < len(self.pages):
return self.pages[page_idx + 1]
return None
def get_prev_block(self, block: Block):
page = self.get_page(block.page_id)
prev_block = page.get_prev_block(block)
if prev_block:
return prev_block
prev_page = self.get_prev_page(page)
if not prev_page:
return None
return prev_page.get_block(prev_page.structure[-1])
def get_prev_page(self, page: PageGroup):
page_idx = self.pages.index(page)
if page_idx > 0:
return self.pages[page_idx - 1]
return None
def assemble_html(
self, child_blocks: List[Block], block_config: Optional[dict] = None
):
template = ""
for c in child_blocks:
template += f"<content-ref src='{c.id}'></content-ref>"
return template
def render(self, block_config: Optional[dict] = None):
child_content = []
section_hierarchy = None
for page in self.pages:
rendered = page.render(self, None, section_hierarchy, block_config)
section_hierarchy = rendered.section_hierarchy.copy()
child_content.append(rendered)
return DocumentOutput(
children=child_content,
html=self.assemble_html(child_content, block_config),
)
def contained_blocks(self, block_types: Sequence[BlockTypes] = None) -> List[Block]:
blocks = []
for page in self.pages:
blocks += page.contained_blocks(self, block_types)
return blocks
```
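A sketch of navigating a built `Document` with the helpers above (`document` is assumed to already exist):
```python
from marker.schema import BlockTypes

tables = document.contained_blocks((BlockTypes.Table,))
if tables:
    nxt = document.get_next_block(tables[0])   # may come from a later page
    prev = document.get_prev_block(tables[0])  # may come from the previous page
page = document.get_page(0)                    # None if there is no page with id 0
```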
--------------------------------------------------------------------------------
/benchmarks/overall/scorers/heuristic.py:
--------------------------------------------------------------------------------
```python
from typing import List
from rapidfuzz import fuzz
from benchmarks.overall.scorers.clean import MarkdownCleaner
from benchmarks.overall.scorers.schema import BlockScores
from benchmarks.overall.scorers import BaseScorer
class HeuristicScorer(BaseScorer):
def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
if not method_markdown:
return {
"score": 0,
"specific_scores": {
"order": 0,
"by_block": [0] * len(gt_markdown)
}
}
# Standardize inputs
gt_markdown = [self.clean_input(block) for block in gt_markdown]
method_markdown = self.clean_input(method_markdown)
alignments = self.find_fuzzy_alignments(method_markdown, gt_markdown)
scores = [alignment["score"] for alignment in alignments]
# Find order score
orders = [alignment["start"] for alignment in alignments]
correct_order = list(range(len(gt_markdown)))
actual_order = sorted(range(len(gt_markdown)), key=lambda x: orders[x])
order_score = self.kendall_tau(correct_order, actual_order)
# Weight score by sequence length
gt_weights = [len(g) for g in gt_markdown]
weighted_scores = [score * weight for score, weight in zip(scores, gt_weights)]
# Normalize by the total ground truth length
overall_score = sum(weighted_scores) / max(1, sum(gt_weights))
overall_score = overall_score * 0.8 + order_score * 0.2
return {
"score": overall_score,
"specific_scores": {
"order": order_score,
"by_block": scores
},
}
@staticmethod
def kendall_tau(correct_order: List[int], actual_order: List[int]) -> float:
n = len(correct_order)
concordant = 0
discordant = 0
if n <= 1:
return 100
for i in range(n):
for j in range(i + 1, n):
correct_sign = correct_order[i] - correct_order[j]
actual_sign = actual_order[i] - actual_order[j]
if (correct_sign > 0 and actual_sign > 0) or (correct_sign < 0 and actual_sign < 0):
concordant += 1
elif (correct_sign < 0 and actual_sign > 0) or (correct_sign > 0 and actual_sign < 0):
discordant += 1
total_pairs = (n * (n - 1)) // 2
tau = (concordant - discordant) / total_pairs
tau = (tau + 1) / 2 # 0-1 scale
return tau * 100 # 0-100 scale
@staticmethod
def find_fuzzy_alignments(
main_string: str,
substrings: List[str],
threshold: int = 70
) -> List[dict]:
alignments = []
for idx, substr in enumerate(substrings):
result = fuzz.partial_ratio_alignment(substr, main_string, score_cutoff=threshold)
score = 0
dest_start = 0
dest_end = 0
if result:
score = result.score
dest_start = result.dest_start
dest_end = result.dest_end
alignments.append({
"string": substr,
"start": dest_start,
"end": dest_end,
"score": score,
"idx": idx
})
return alignments
@staticmethod
def clean_input(md: str):
cleaner = MarkdownCleaner()
return cleaner(md)
```
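A toy invocation of the scorer (assuming `BaseScorer` takes no constructor arguments); `gt_markdown` is the list of ground-truth blocks and `method_markdown` is the predicted markdown as a single string:
```python
from benchmarks.overall.scorers.heuristic import HeuristicScorer

scorer = HeuristicScorer()
result = scorer(
    None,                                   # `sample` is unused by this scorer
    ["# Results", "Apples outsold bananas in 2023."],
    "# Results\n\nApples outsold bananas in 2023.",
)
print(result["score"])                      # 0-100, weighted by block length
print(result["specific_scores"]["order"])   # Kendall tau on block order, 0-100
```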
--------------------------------------------------------------------------------
/marker/output.py:
--------------------------------------------------------------------------------
```python
import json
import os
from bs4 import BeautifulSoup, Tag
from pydantic import BaseModel
from PIL import Image
from marker.renderers.extraction import ExtractionOutput
from marker.renderers.html import HTMLOutput
from marker.renderers.json import JSONOutput, JSONBlockOutput
from marker.renderers.markdown import MarkdownOutput
from marker.renderers.ocr_json import OCRJSONOutput
from marker.schema.blocks import BlockOutput
from marker.settings import settings
def unwrap_outer_tag(html: str):
soup = BeautifulSoup(html, "html.parser")
contents = list(soup.contents)
if len(contents) == 1 and isinstance(contents[0], Tag) and contents[0].name == "p":
# Unwrap the p tag
soup.p.unwrap()
return str(soup)
def json_to_html(block: JSONBlockOutput | BlockOutput):
# Utility function to take in json block output and give html for the block.
if not getattr(block, "children", None):
return block.html
else:
child_html = [json_to_html(child) for child in block.children]
child_ids = [child.id for child in block.children]
soup = BeautifulSoup(block.html, "html.parser")
content_refs = soup.find_all("content-ref")
for ref in content_refs:
src_id = ref.attrs["src"]
if src_id in child_ids:
child_soup = BeautifulSoup(
child_html[child_ids.index(src_id)], "html.parser"
)
ref.replace_with(child_soup)
return str(soup)
def output_exists(output_dir: str, fname_base: str):
exts = ["md", "html", "json"]
for ext in exts:
if os.path.exists(os.path.join(output_dir, f"{fname_base}.{ext}")):
return True
return False
def text_from_rendered(rendered: BaseModel):
from marker.renderers.chunk import ChunkOutput # Has an import from this file
if isinstance(rendered, MarkdownOutput):
return rendered.markdown, "md", rendered.images
elif isinstance(rendered, HTMLOutput):
return rendered.html, "html", rendered.images
elif isinstance(rendered, JSONOutput):
return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
elif isinstance(rendered, ChunkOutput):
return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
elif isinstance(rendered, OCRJSONOutput):
return rendered.model_dump_json(exclude=["metadata"], indent=2), "json", {}
elif isinstance(rendered, ExtractionOutput):
return rendered.document_json, "json", {}
else:
raise ValueError("Invalid output type")
def convert_if_not_rgb(image: Image.Image) -> Image.Image:
if image.mode != "RGB":
image = image.convert("RGB")
return image
def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
text, ext, images = text_from_rendered(rendered)
text = text.encode(settings.OUTPUT_ENCODING, errors="replace").decode(
settings.OUTPUT_ENCODING
)
with open(
os.path.join(output_dir, f"{fname_base}.{ext}"),
"w+",
encoding=settings.OUTPUT_ENCODING,
) as f:
f.write(text)
with open(
os.path.join(output_dir, f"{fname_base}_meta.json"),
"w+",
encoding=settings.OUTPUT_ENCODING,
) as f:
f.write(json.dumps(rendered.metadata, indent=2))
for img_name, img in images.items():
img = convert_if_not_rgb(img) # RGBA images can't save as JPG
img.save(os.path.join(output_dir, img_name), settings.OUTPUT_IMAGE_FORMAT)
```
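A sketch of saving rendered output to disk; `rendered` is assumed to be the result of any marker renderer (MarkdownOutput, HTMLOutput, JSONOutput, ...):
```python
import os
from marker.output import save_output, text_from_rendered

os.makedirs("out", exist_ok=True)
text, ext, images = text_from_rendered(rendered)   # e.g. (markdown_str, "md", {name: PIL image})
save_output(rendered, "out", "my_doc")             # writes my_doc.<ext>, my_doc_meta.json, images
```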
--------------------------------------------------------------------------------
/marker/processors/block_relabel.py:
--------------------------------------------------------------------------------
```python
from copy import deepcopy
from typing import Annotated
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import BlockId
from marker.schema.document import Document
from marker.schema.registry import get_block_class
from marker.logger import get_logger
logger = get_logger()
class BlockRelabelProcessor(BaseProcessor):
"""
A processor to heuristically relabel blocks based on a confidence threshold.
Each rule in the relabel string maps an original block label to a new one
when the model's confidence in the original label is at or below the given threshold.
"""
block_relabel_str: Annotated[
str,
"Comma-separated relabeling rules in the format '<original_label>:<new_label>:<confidence_threshold>'.",
"Each rule defines how blocks of a certain type should be relabeled when the confidence exceeds the threshold.",
"Example: 'Table:Picture:0.85,Form:Picture:0.9'"
] = ""
def __init__(self, config=None):
super().__init__(config)
self.block_relabel_map = {}
if not self.block_relabel_str:
return
for i, block_config_str in enumerate(self.block_relabel_str.split(',')):
block_config_str = block_config_str.strip()
if not block_config_str:
continue # Skip empty segments
try:
parts = block_config_str.split(':')
if len(parts) != 3:
raise ValueError(f"Expected 3 parts, got {len(parts)}")
block_label, block_relabel, confidence_str = parts
confidence_thresh = float(confidence_str)
block_type = BlockTypes[block_label]
relabel_block_type = BlockTypes[block_relabel]
self.block_relabel_map[block_type] = (
confidence_thresh,
relabel_block_type
)
except Exception as e:
logger.warning(f"Failed to parse relabel rule '{block_config_str}' at index {i}: {e}. Expected format is <original_label>:<new_label>:<confidence_threshold>")
def __call__(self, document: Document):
if len(self.block_relabel_map) == 0:
return
for page in document.pages:
for block in page.structure_blocks(document):
if block.block_type not in self.block_relabel_map:
continue
block_id = BlockId(page_id=page.page_id, block_id=block.block_id, block_type=block.block_type)
confidence_thresh, relabel_block_type = self.block_relabel_map[block.block_type]
confidence = block.top_k.get(block.block_type)
if confidence > confidence_thresh:
logger.debug(f"Skipping relabel for {block_id}; Confidence: {confidence} > Confidence Threshold {confidence_thresh} for re-labelling")
continue
new_block_cls = get_block_class(relabel_block_type)
new_block = new_block_cls(
polygon=deepcopy(block.polygon),
page_id=block.page_id,
structure=deepcopy(block.structure),
text_extraction_method=block.text_extraction_method,
source="heuristics",
top_k=block.top_k,
metadata=block.metadata
)
page.replace_block(block, new_block)
logger.debug(f"Relabelled {block_id} to {relabel_block_type}")
```
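A sketch of the rule format parsed above, assuming (as with other marker processors) that config keys are applied as attributes; the thresholds are illustrative:
```python
from marker.processors.block_relabel import BlockRelabelProcessor

processor = BlockRelabelProcessor(
    config={"block_relabel_str": "Table:Picture:0.85,Form:Picture:0.9"}
)
# Blocks whose confidence in their current label is at or below the threshold
# are swapped to the new block type; `document` is a built marker Document.
processor(document)
```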
--------------------------------------------------------------------------------
/marker/providers/spreadsheet.py:
--------------------------------------------------------------------------------
```python
import os
import tempfile
from marker.providers.pdf import PdfProvider
css = '''
@page {
size: A4 landscape;
margin: 1.5cm;
}
table {
width: 100%;
border-collapse: collapse;
break-inside: auto;
font-size: 10pt;
}
tr {
break-inside: avoid;
page-break-inside: avoid;
}
td {
border: 0.75pt solid #000;
padding: 6pt;
}
'''
class SpreadSheetProvider(PdfProvider):
def __init__(self, filepath: str, config=None):
temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=f".pdf")
self.temp_pdf_path = temp_pdf.name
temp_pdf.close()
# Convert XLSX to PDF
try:
self.convert_xlsx_to_pdf(filepath)
except Exception as e:
raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}")
# Initialize the PDF provider with the temp pdf path
super().__init__(self.temp_pdf_path, config)
def __del__(self):
if os.path.exists(self.temp_pdf_path):
os.remove(self.temp_pdf_path)
def convert_xlsx_to_pdf(self, filepath: str):
from weasyprint import CSS, HTML
from openpyxl import load_workbook
html = ""
workbook = load_workbook(filepath)
if workbook is not None:
for sheet_name in workbook.sheetnames:
sheet = workbook[sheet_name]
html += f'<div><h1>{sheet_name}</h1>' + self._excel_to_html_table(sheet) + '</div>'
else:
raise ValueError("Invalid XLSX file")
# We convert the HTML into a PDF
HTML(string=html).write_pdf(
self.temp_pdf_path,
stylesheets=[CSS(string=css), self.get_font_css()]
)
@staticmethod
def _get_merged_cell_ranges(sheet):
merged_info = {}
for merged_range in sheet.merged_cells.ranges:
min_col, min_row, max_col, max_row = merged_range.bounds
merged_info[(min_row, min_col)] = {
'rowspan': max_row - min_row + 1,
'colspan': max_col - min_col + 1,
'range': merged_range
}
return merged_info
def _excel_to_html_table(self, sheet):
merged_cells = self._get_merged_cell_ranges(sheet)
html = f'<table>'
# Track cells we should skip due to being part of a merge range
skip_cells = set()
for row_idx, row in enumerate(sheet.rows, 1):
html += '<tr>'
for col_idx, cell in enumerate(row, 1):
if (row_idx, col_idx) in skip_cells:
continue
# Check if this cell is the start of a merged range
merge_info = merged_cells.get((row_idx, col_idx))
if merge_info:
# Add cells to skip
for r in range(row_idx, row_idx + merge_info['rowspan']):
for c in range(col_idx, col_idx + merge_info['colspan']):
if (r, c) != (row_idx, col_idx):
skip_cells.add((r, c))
# Add merged cell with rowspan/colspan
value = cell.value if cell.value is not None else ''
html += f'<td rowspan="{merge_info["rowspan"]}" colspan="{merge_info["colspan"]}">{value}'
else:
# Regular cell
value = cell.value if cell.value is not None else ''
html += f'<td>{value}'
html += '</td>'
html += '</tr>'
html += '</table>'
return html
```
--------------------------------------------------------------------------------
/marker/processors/ignoretext.py:
--------------------------------------------------------------------------------
```python
import re
from collections import Counter
from itertools import groupby
from typing import Annotated, List
from rapidfuzz import fuzz
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.schema.document import Document
class IgnoreTextProcessor(BaseProcessor):
"""
A processor for identifying and ignoring common text blocks in a document.
These blocks often represent repetitive or non-essential elements, such as headers, footers, or page numbers.
"""
block_types = (
BlockTypes.Text, BlockTypes.SectionHeader,
BlockTypes.TextInlineMath
)
common_element_threshold: Annotated[
float,
"The minimum ratio of pages a text block must appear on to be considered a common element.",
"Blocks that meet or exceed this threshold are marked as common elements.",
] = 0.2
common_element_min_blocks: Annotated[
int,
"The minimum number of occurrences of a text block within a document to consider it a common element.",
"This ensures that rare blocks are not mistakenly flagged.",
] = 3
max_streak: Annotated[
int,
"The maximum number of consecutive occurrences of a text block allowed before it is classified as a common element.",
"Helps to identify patterns like repeated headers or footers.",
] = 3
text_match_threshold: Annotated[
int,
"The minimum fuzzy match score (0-100) required to classify a text block as similar to a common element.",
"Higher values enforce stricter matching.",
] = 90
def __call__(self, document: Document):
first_blocks = []
last_blocks = []
for page in document.pages:
initial_block = None
last_block = None
for block in page.contained_blocks(document, self.block_types):
if block.structure is not None:
if initial_block is None:
initial_block = block
last_block = block
if initial_block is not None:
first_blocks.append(initial_block)
if last_block is not None:
last_blocks.append(last_block)
self.filter_common_elements(document, first_blocks)
self.filter_common_elements(document, last_blocks)
@staticmethod
def clean_text(text):
text = text.replace("\n", "").strip()
text = re.sub(r"^\d+\s*", "", text) # remove numbers at the start of the line
text = re.sub(r"\s*\d+$", "", text) # remove numbers at the end of the line
return text
def filter_common_elements(self, document, blocks: List[Block]):
# We can't filter if we don't have enough pages to find common elements
if len(blocks) < self.common_element_min_blocks:
return
text = [self.clean_text(b.raw_text(document)) for b in blocks]
streaks = {}
for key, group in groupby(text):
streaks[key] = max(streaks.get(key, 0), len(list(group)))
counter = Counter(text)
common = [
k for k, v in counter.items()
if (v >= len(blocks) * self.common_element_threshold or streaks[k] >= self.max_streak)
and v > self.common_element_min_blocks
]
if len(common) == 0:
return
for t, b in zip(text, blocks):
# Check against all common elements
if any(fuzz.ratio(t, common_element) > self.text_match_threshold for common_element in common):
b.ignore_for_output = True
```
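A tiny illustration of the cleanup applied before fuzzy matching, which strips leading and trailing page numbers from candidate headers/footers:
```python
from marker.processors.ignoretext import IgnoreTextProcessor

IgnoreTextProcessor.clean_text("3  Journal of Examples  17\n")
# -> "Journal of Examples"
```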
--------------------------------------------------------------------------------
/marker/processors/llm/llm_complex.py:
--------------------------------------------------------------------------------
```python
from typing import List
import markdown2
from pydantic import BaseModel
from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
class LLMComplexRegionProcessor(BaseLLMSimpleBlockProcessor):
block_types = (BlockTypes.ComplexRegion,)
complex_region_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
You will receive an image of a text block and the text that can be extracted from the image.
Your task is to generate markdown to properly represent the content of the image. Do not omit any text present in the image - make sure everything is included in the markdown representation. The markdown representation should be as faithful to the original image as possible.
Formatting should be in markdown, with the following rules:
- * for italics, ** for bold, and ` for inline code.
- Use <sup>...</sup> for superscripts.
- Headers should be formatted with #, with one # for the largest header, and up to 6 for the smallest.
- Lists should be formatted with either - or 1. for unordered and ordered lists, respectively.
- Links should be formatted with [text](url).
- Use ``` for code blocks.
- Inline math should be formatted with <math>math expression</math>.
- Display math should be formatted with <math display="block">math expression</math>.
- Values and labels should be extracted from forms, and put into markdown tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables.
- Tables should be formatted with markdown tables, with the headers bolded.
**Instructions:**
1. Carefully examine the provided block image.
2. Analyze the existing text representation.
3. Generate the markdown representation of the content in the image.
**Example:**
Input:
```text
Table 1: Car Sales
```
Output:
```markdown
## Table 1: Car Sales
| Car | Sales |
| --- | --- |
| Honda | 100 |
| Toyota | 200 |
```
**Input:**
```text
{extracted_text}
```
"""
def block_prompts(self, document: Document) -> List[PromptData]:
prompt_data = []
for block in self.inference_blocks(document):
text = block["block"].raw_text(document)
prompt = self.complex_region_prompt.replace("{extracted_text}", text)
image = self.extract_image(document, block["block"])
prompt_data.append({
"prompt": prompt,
"image": image,
"block": block["block"],
"schema": ComplexSchema,
"page": block["page"]
})
return prompt_data
def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
block = prompt_data["block"]
text = block.raw_text(document)
if not response or "corrected_markdown" not in response:
block.update_metadata(llm_error_count=1)
return
corrected_markdown = response["corrected_markdown"]
# The original table is okay
if "no corrections" in corrected_markdown.lower():
return
# Potentially a partial response
if len(corrected_markdown) < len(text) * .5:
block.update_metadata(llm_error_count=1)
return
# Convert LLM markdown to html
corrected_markdown = corrected_markdown.strip().lstrip("```markdown").rstrip("```").strip()
block.html = markdown2.markdown(corrected_markdown, extras=["tables"])
class ComplexSchema(BaseModel):
corrected_markdown: str
```
--------------------------------------------------------------------------------
/tests/converters/test_pdf_converter.py:
--------------------------------------------------------------------------------
```python
import io
import pytest
from marker.converters.pdf import PdfConverter
from marker.renderers.markdown import MarkdownOutput
@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True})
def test_pdf_converter(pdf_converter: PdfConverter, temp_doc):
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
markdown = markdown_output.markdown
# Basic assertions
assert len(markdown) > 0
assert "# Subspace Adversarial Training" in markdown
# Some assertions for line joining across pages
assert (
"AT solutions. However, these methods highly rely on specifically" in markdown
) # pgs: 1-2
assert (
"(with adversarial perturbations), which harms natural accuracy, " in markdown
) # pgs: 3-4
# Some assertions for line joining across columns
assert "remain similar across a wide range of choices." in markdown # pg: 2
assert "a new scheme for designing more robust and efficient" in markdown # pg: 8
@pytest.mark.filename("manual.epub")
@pytest.mark.config({"page_range": [0]})
def test_epub_converter(pdf_converter: PdfConverter, temp_doc):
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
markdown = markdown_output.markdown
# Basic assertions
assert "Simple Sabotage Field Manual" in markdown
@pytest.mark.filename("single_sheet.xlsx")
@pytest.mark.config({"page_range": [0]})
def test_xlsx_converter(pdf_converter: PdfConverter, temp_doc):
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
markdown = markdown_output.markdown
# Basic assertions
assert "four" in markdown
@pytest.mark.filename("china.html")
@pytest.mark.config({"page_range": [10]})
def test_html_converter(pdf_converter: PdfConverter, temp_doc):
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
markdown = markdown_output.markdown
# Basic assertions
assert "Republic of China" in markdown
@pytest.mark.filename("gatsby.docx")
@pytest.mark.config({"page_range": [0]})
def test_docx_converter(pdf_converter: PdfConverter, temp_doc):
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
markdown = markdown_output.markdown
# Basic assertions
assert "The Decline of the American Dream in the 1920s" in markdown
@pytest.mark.filename("lambda.pptx")
@pytest.mark.config({"page_range": [0]})
def test_pptx_converter(pdf_converter: PdfConverter, temp_doc):
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
markdown = markdown_output.markdown
# Basic assertions
assert "Adam Doupé" in markdown
@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True})
def test_pdf_converter_bytes(pdf_converter: PdfConverter, temp_doc):
with open(temp_doc.name, "rb") as f:
data = f.read()
input_bytes = io.BytesIO(data)
markdown_output: MarkdownOutput = pdf_converter(input_bytes)
markdown = markdown_output.markdown
# Basic assertions
assert len(markdown) > 0
assert "# Subspace Adversarial Training" in markdown
# Some assertions for line joining across pages
assert (
"AT solutions. However, these methods highly rely on specifically" in markdown
) # pgs: 1-2
assert (
"(with adversarial perturbations), which harms natural accuracy, " in markdown
) # pgs: 3-4
# Some assertions for line joining across columns
assert "remain similar across a wide range of choices." in markdown # pg: 2
assert "a new scheme for designing more robust and efficient" in markdown # pg: 8
```
--------------------------------------------------------------------------------
/benchmarks/table/table.py:
--------------------------------------------------------------------------------
```python
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS
from pathlib import Path
from itertools import repeat
from typing import List
import time
import datasets
from tqdm import tqdm
import click
from tabulate import tabulate
import json
from concurrent.futures import ProcessPoolExecutor
from marker.settings import settings
from benchmarks.table.inference import inference_tables
from scoring import wrap_table_html, similarity_eval_html
def update_teds_score(result, prefix: str = "marker"):
prediction, ground_truth = result[f'{prefix}_table'], result['gt_table']
prediction, ground_truth = wrap_table_html(prediction), wrap_table_html(ground_truth)
score = similarity_eval_html(prediction, ground_truth)
result.update({f'{prefix}_score':score})
return result
@click.command(help="Benchmark Table to HTML Conversion")
@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "table"), help="Output path for results.")
@click.option("--dataset", type=str, default="datalab-to/fintabnet_bench_marker", help="Dataset to use")
@click.option("--max_rows", type=int, default=None, help="Maximum number of PDFs to process")
@click.option("--max_workers", type=int, default=16, help="Maximum number of workers to use")
@click.option("--use_llm", is_flag=True, help="Use LLM for improving table recognition.")
@click.option("--table_rec_batch_size", type=int, default=None, help="Batch size for table recognition.")
@click.option("--use_gemini", is_flag=True, help="Evaluate Gemini for table recognition.")
def main(
result_path: str,
dataset: str,
max_rows: int,
max_workers: int,
use_llm: bool,
table_rec_batch_size: int | None,
use_gemini: bool = False
):
start = time.time()
dataset = datasets.load_dataset(dataset, split='train')
dataset = dataset.shuffle(seed=0)
results, total_unaligned = inference_tables(dataset, use_llm, table_rec_batch_size, max_rows, use_gemini)
print(f"Total time: {time.time() - start}.")
print(f"Could not align {total_unaligned} tables from fintabnet.")
with ProcessPoolExecutor(max_workers=max_workers) as executor:
marker_results = list(
tqdm(
executor.map(update_teds_score, results), desc='Computing alignment scores', total=len(results)
)
)
avg_score = sum([r["marker_score"] for r in marker_results]) / len(marker_results)
headers = ["Avg score", "Total tables"]
data = [f"{avg_score:.3f}", len(marker_results)]
gemini_results = None
if use_gemini:
with ProcessPoolExecutor(max_workers=max_workers) as executor:
gemini_results = list(
tqdm(
executor.map(update_teds_score, results, repeat("gemini")), desc='Computing Gemini scores',
total=len(results)
)
)
avg_gemini_score = sum([r["gemini_score"] for r in gemini_results]) / len(gemini_results)
headers.append("Avg Gemini score")
data.append(f"{avg_gemini_score:.3f}")
table = tabulate([data], headers=headers, tablefmt="github")
print(table)
print("Avg score computed by comparing marker predicted HTML with original HTML")
results = {
"marker": marker_results,
"gemini": gemini_results
}
out_path = Path(result_path)
out_path.mkdir(parents=True, exist_ok=True)
with open(out_path / "table.json", "w+") as f:
json.dump(results, f, indent=2)
print(f"Results saved to {out_path}.")
if __name__ == '__main__':
main()
```
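A sketch of driving the benchmark from Python through click (flag names mirror the `@click.option` declarations above; the values are illustrative and the run still requires the dataset and models to be available):
```python
from benchmarks.table.table import main

# standalone_mode=False keeps click from calling sys.exit() when the run finishes.
main(["--max_rows", "10", "--max_workers", "4"], standalone_mode=False)
```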
--------------------------------------------------------------------------------
/benchmarks/table/scoring.py:
--------------------------------------------------------------------------------
```python
""""
TEDS Code Adapted from https://github.com/ibm-aur-nlp/EDD
"""
import distance
from apted import APTED, Config
from apted.helpers import Tree
from lxml import html
from collections import deque
def wrap_table_html(table_html:str)->str:
return f'<html><body>{table_html}</body></html>'
class TableTree(Tree):
def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
self.tag = tag
self.colspan = colspan
self.rowspan = rowspan
self.content = content
# Sets self.name and self.children
super().__init__(tag, *children)
def bracket(self):
"""Show tree using brackets notation"""
if self.tag == 'td':
result = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % \
(self.tag, self.colspan, self.rowspan, self.content)
else:
result = '"tag": %s' % self.tag
for child in self.children:
result += child.bracket()
return "{{{}}}".format(result)
class CustomConfig(Config):
@staticmethod
def maximum(*sequences):
return max(map(len, sequences))
def normalized_distance(self, *sequences):
return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)
def rename(self, node1, node2):
if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (node1.rowspan != node2.rowspan):
return 1.
if node1.tag == 'td':
if node1.content or node2.content:
return self.normalized_distance(node1.content, node2.content)
return 0.
def tokenize(node):
"""
Tokenizes table cells
"""
global __tokens__
__tokens__.append('<%s>' % node.tag)
if node.text is not None:
__tokens__ += list(node.text)
for n in node.getchildren():
tokenize(n)
if node.tag != 'unk':
__tokens__.append('</%s>' % node.tag)
if node.tag != 'td' and node.tail is not None:
__tokens__ += list(node.tail)
def tree_convert_html(node, convert_cell=False, parent=None):
"""
Converts HTML tree to the format required by apted
"""
global __tokens__
if node.tag == 'td':
if convert_cell:
__tokens__ = []
tokenize(node)
cell = __tokens__[1:-1].copy()
else:
cell = []
new_node = TableTree(node.tag,
int(node.attrib.get('colspan', '1')),
int(node.attrib.get('rowspan', '1')),
cell, *deque())
else:
new_node = TableTree(node.tag, None, None, None, *deque())
if parent is not None:
parent.children.append(new_node)
if node.tag != 'td':
for n in node.getchildren():
tree_convert_html(n, convert_cell, new_node)
if parent is None:
return new_node
def similarity_eval_html(pred, true, structure_only=False):
"""
Computes TEDS score between the prediction and the ground truth of a given samples
"""
pred, true = html.fromstring(pred), html.fromstring(true)
if pred.xpath('body/table') and true.xpath('body/table'):
pred = pred.xpath('body/table')[0]
true = true.xpath('body/table')[0]
n_nodes_pred = len(pred.xpath(".//*"))
n_nodes_true = len(true.xpath(".//*"))
tree_pred = tree_convert_html(pred, convert_cell=not structure_only)
tree_true = tree_convert_html(true, convert_cell=not structure_only)
n_nodes = max(n_nodes_pred, n_nodes_true)
distance = APTED(tree_pred, tree_true, CustomConfig()).compute_edit_distance()
return 1.0 - (float(distance) / n_nodes)
else:
return 0.0
```
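A toy check of the TEDS score: an identical table scores 1.0, and dropping a cell lowers the score:
```python
from benchmarks.table.scoring import wrap_table_html, similarity_eval_html

gt = wrap_table_html("<table><tr><td>a</td><td>b</td></tr></table>")
pred = wrap_table_html("<table><tr><td>a</td></tr></table>")
print(similarity_eval_html(gt, gt))    # 1.0
print(similarity_eval_html(pred, gt))  # < 1.0, penalized for the missing cell
```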
--------------------------------------------------------------------------------
/marker/schema/text/span.py:
--------------------------------------------------------------------------------
```python
import html
import re
from typing import List, Literal, Optional
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.util import unwrap_math
def cleanup_text(full_text):
full_text = re.sub(r"(\n\s){3,}", "\n\n", full_text)
full_text = full_text.replace("\xa0", " ") # Replace non-breaking spaces
return full_text
class Span(Block):
block_type: BlockTypes = BlockTypes.Span
block_description: str = "A span of text inside a line."
text: str
font: str
font_weight: float
font_size: float
minimum_position: int
maximum_position: int
formats: List[
Literal[
"plain",
"math",
"chemical",
"bold",
"italic",
"highlight",
"subscript",
"superscript",
"small",
"code",
"underline",
]
]
has_superscript: bool = False
has_subscript: bool = False
url: Optional[str] = None
html: Optional[str] = None
@property
def bold(self):
return "bold" in self.formats
@property
def italic(self):
return "italic" in self.formats
@property
def math(self):
return "math" in self.formats
@property
def highlight(self):
return "highlight" in self.formats
@property
def superscript(self):
return "superscript" in self.formats
@property
def subscript(self):
return "subscript" in self.formats
@property
def small(self):
return "small" in self.formats
@property
def code(self):
return "code" in self.formats
@property
def underline(self):
return "underline" in self.formats
def assemble_html(self, document, child_blocks, parent_structure, block_config):
if self.ignore_for_output:
return ""
if self.html:
return self.html
text = self.text
# Remove trailing newlines
replaced_newline = False
while len(text) > 0 and text[-1] in ["\n", "\r"]:
text = text[:-1]
replaced_newline = True
# Remove leading newlines
while len(text) > 0 and text[0] in ["\n", "\r"]:
text = text[1:]
if replaced_newline and not text.endswith("-"):
text += " "
text = text.replace(
"-\n", ""
) # Remove hyphenated line breaks from the middle of the span
text = html.escape(text)
text = cleanup_text(text)
if self.has_superscript:
text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", text)
# Handle full block superscript
if "<sup>" not in text:
text = f"<sup>{text}</sup>"
if self.url:
text = f"<a href='{self.url}'>{text}</a>"
# TODO Support multiple formats
if self.italic:
text = f"<i>{text}</i>"
elif self.bold:
text = f"<b>{text}</b>"
elif self.math:
block_envs = ["split", "align", "gather", "multline"]
if any(f"\\begin{{{env}}}" in text for env in block_envs):
display_mode = "block"
else:
display_mode = "inline"
text = f"<math display='{display_mode}'>{text}</math>"
elif self.highlight:
text = f"<mark>{text}</mark>"
elif self.subscript:
text = f"<sub>{text}</sub>"
elif self.superscript:
text = f"<sup>{text}</sup>"
elif self.underline:
text = f"<u>{text}</u>"
elif self.small:
text = f"<small>{text}</small>"
elif self.code:
text = f"<code>{text}</code>"
text = unwrap_math(text)
return text
```
--------------------------------------------------------------------------------
/marker/processors/llm/llm_handwriting.py:
--------------------------------------------------------------------------------
```python
import markdown2
from pydantic import BaseModel
from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData
from marker.schema import BlockTypes
from marker.schema.document import Document
from typing import Annotated, List
class LLMHandwritingProcessor(BaseLLMSimpleBlockProcessor):
block_types = (BlockTypes.Handwriting, BlockTypes.Text)
handwriting_generation_prompt: Annotated[
str,
"The prompt to use for OCRing handwriting.",
"Default is a string containing the Gemini prompt."
] = """You are an expert editor specializing in accurately reproducing text from images.
You will receive an image of a text block. Your task is to generate markdown to properly represent the content of the image. Do not omit any text present in the image - make sure everything is included in the markdown representation. The markdown representation should be as faithful to the original image as possible.
Formatting should be in markdown, with the following rules:
- * for italics, ** for bold, and ` for inline code.
- Headers should be formatted with #, with one # for the largest header, and up to 6 for the smallest.
- Lists should be formatted with either - or 1. for unordered and ordered lists, respectively.
- Links should be formatted with [text](url).
- Use ``` for code blocks.
- Inline math should be formatted with <math>math expression</math>.
- Display math should be formatted with <math display="block">math expression</math>.
- Values and labels should be extracted from forms, and put into markdown tables, with the labels on the left side, and values on the right. The headers should be "Labels" and "Values". Other text in the form can appear between the tables.
- Tables should be formatted with markdown tables, with the headers bolded.
**Instructions:**
1. Carefully examine the provided block image.
2. Output the markdown representing the content of the image.
"""
def inference_blocks(self, document: Document) -> List[BlockData]:
blocks = super().inference_blocks(document)
out_blocks = []
for block_data in blocks:
raw_text = block_data["block"].raw_text(document)
block = block_data["block"]
# Don't process text blocks that contain lines already
if block.block_type == BlockTypes.Text:
lines = block.contained_blocks(document, (BlockTypes.Line,))
if len(lines) > 0 or len(raw_text.strip()) > 0:
continue
out_blocks.append(block_data)
return out_blocks
def block_prompts(self, document: Document) -> List[PromptData]:
prompt_data = []
for block_data in self.inference_blocks(document):
block = block_data["block"]
prompt = self.handwriting_generation_prompt
image = self.extract_image(document, block)
prompt_data.append({
"prompt": prompt,
"image": image,
"block": block,
"schema": HandwritingSchema,
"page": block_data["page"]
})
return prompt_data
def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
block = prompt_data["block"]
raw_text = block.raw_text(document)
if not response or "markdown" not in response:
block.update_metadata(llm_error_count=1)
return
markdown = response["markdown"]
if len(markdown) < len(raw_text) * .5:
block.update_metadata(llm_error_count=1)
return
markdown = markdown.strip().removeprefix("```markdown").removeprefix("```").removesuffix("```").strip()  # removeprefix/removesuffix strip the fence without eating leading content characters
block.html = markdown2.markdown(markdown, extras=["tables"])
class HandwritingSchema(BaseModel):
markdown: str
```
--------------------------------------------------------------------------------
/marker/services/azure_openai.py:
--------------------------------------------------------------------------------
```python
import json
import time
from typing import Annotated, List
import PIL
from marker.logger import get_logger
from openai import AzureOpenAI, APITimeoutError, RateLimitError
from PIL import Image
from pydantic import BaseModel
from marker.schema.blocks import Block
from marker.services import BaseService
logger = get_logger()
class AzureOpenAIService(BaseService):
azure_endpoint: Annotated[
str, "The Azure OpenAI endpoint URL. No trailing slash."
] = None
azure_api_key: Annotated[
str, "The API key to use for the Azure OpenAI service."
] = None
azure_api_version: Annotated[str, "The Azure OpenAI API version to use."] = None
deployment_name: Annotated[
str, "The deployment name for the Azure OpenAI model."
] = None
def process_images(self, images: List[PIL.Image.Image]) -> list:
if isinstance(images, Image.Image):
images = [images]
return [
{
"type": "image_url",
"image_url": {
"url": "data:image/webp;base64,{}".format(self.img_to_base64(img)),
},
}
for img in images
]
def __call__(
self,
prompt: str,
image: PIL.Image.Image | List[PIL.Image.Image] | None,
block: Block | None,
response_schema: type[BaseModel],
max_retries: int | None = None,
timeout: int | None = None,
):
if max_retries is None:
max_retries = self.max_retries
if timeout is None:
timeout = self.timeout
client = self.get_client()
image_data = self.format_image_for_llm(image)
messages = [
{
"role": "user",
"content": [
*image_data,
{"type": "text", "text": prompt},
],
}
]
total_tries = max_retries + 1
for tries in range(1, total_tries + 1):
try:
response = client.beta.chat.completions.parse(
extra_headers={
"X-Title": "Marker",
"HTTP-Referer": "https://github.com/datalab-to/marker",
},
model=self.deployment_name,
messages=messages,
timeout=timeout,
response_format=response_schema,
)
response_text = response.choices[0].message.content
total_tokens = response.usage.total_tokens
if block:
block.update_metadata(
llm_tokens_used=total_tokens, llm_request_count=1
)
return json.loads(response_text)
except (APITimeoutError, RateLimitError) as e:
# Rate limit exceeded
if tries == total_tries:
# Last attempt failed. Give up
logger.error(
f"Rate limit error: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})"
)
break
else:
wait_time = tries * self.retry_wait_time
logger.warning(
f"Rate limit error: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})"
)
time.sleep(wait_time)
except Exception as e:
logger.error(f"Azure OpenAI inference failed: {e}")
break
return {}
def get_client(self) -> AzureOpenAI:
return AzureOpenAI(
api_version=self.azure_api_version,
azure_endpoint=self.azure_endpoint,
api_key=self.azure_api_key,
)
```
--------------------------------------------------------------------------------
/benchmarks/overall/scorers/clean.py:
--------------------------------------------------------------------------------
```python
import re
import subprocess
import tempfile
from pathlib import Path
import latex2mathml.converter
class MarkdownCleaner:
def __init__(self):
pass
def __call__(self, markdown):
markdown = self.normalize_markdown(markdown) # Use pandoc to normalize
# Replace math expressions with latexml
pattern = r'(?<!\\)\$(?:\$([^$]+)\$\$|\s*([^$\n]+?)\s*\$)'
markdown = re.sub(pattern, self.standardize_math, markdown)
# Replace image urls with a generic tag
pattern = r'!\[(.*?)\]\((https?://[^\s\)]+)\)'
markdown = re.sub(pattern, r'![link]', markdown)
# Clean up stray html tags
markdown = markdown.replace("<br>", "\n")
markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown)
markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown)
markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown) # Remove span tags and keep content
# Clean up markdown formatting
markdown = re.sub(r"\s+", " ", markdown)
markdown = re.sub(r"\n+", "\n", markdown)
markdown = re.sub("\\.+", ".",
markdown) # Replace repeated periods with a single period, like in table of contents
markdown = re.sub("#+", "#", markdown) # Replace repeated headers with a single header
markdown = markdown.encode().decode('unicode-escape', errors="ignore") # Decode unicode characters properly
return markdown.strip().lower()
@staticmethod
def normalize_markdown(md_text: str) -> str:
with tempfile.TemporaryDirectory() as tmp_dir:
dirpath = Path(tmp_dir)
input_file = dirpath / 'input.md'
input_file.write_text(md_text, encoding='utf-8')
# Markdown to HTML
html_file = dirpath / 'temp.html'
subprocess.run(
[
'pandoc',
str(input_file),
'-f', 'markdown+tex_math_dollars',
'-t', 'html',
'-o', str(html_file),
'--quiet'
],
check=True
)
# HTML to Markdown
output_file = dirpath / 'output.md'
subprocess.run(
[
'pandoc',
str(html_file),
'-f', 'html',
'-t', 'markdown+tex_math_dollars',
'-o', str(output_file),
'--quiet'
],
check=True
)
# Read back the normalized Markdown
normalized_md = output_file.read_text(encoding='utf-8')
return normalized_md
def standardize_math(self, match):
try:
delim = "$$" if match.group(0).startswith('$$') else "$"
math_content = match.group(1) or match.group(2)
if delim == "$$":
math_content = latex2mathml.converter.convert(math_content)
else:
math_content = self.clean_latex(math_content)
return f'{delim}{math_content}{delim}'
except Exception as e:
print(f"Failed to standardize math expression: {match.group(0)} with error: {e}")
return match.group(0)
@staticmethod
def clean_latex(latex_str):
latex_str = re.sub(r'\s+', ' ', latex_str.strip())
for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']:
latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str)
replacements = {
'\\times': '*',
'\\cdot': '*',
'\\div': '/',
'\\le': '<=',
'\\ge': '>=',
'\\neq': '!=',
'\\to': '\\rightarrow',
}
for old, new in replacements.items():
latex_str = latex_str.replace(old, new)
return latex_str
```
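A minimal usage sketch, assuming the `pandoc` binary is installed and on PATH (normalize_markdown shells out to it); the sample markdown is made up.
```python
# Illustrative cleanup of a markdown string (requires pandoc on PATH).
from benchmarks.overall.scorers.clean import MarkdownCleaner

cleaner = MarkdownCleaner()
raw_md = "# Results\n\nThe value is $x \\le 5$, see ![chart](https://example.com/img.png)"
print(cleaner(raw_md))  # normalized, lower-cased text with math and image links standardized
```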
--------------------------------------------------------------------------------
/marker/utils/gpu.py:
--------------------------------------------------------------------------------
```python
import os
import subprocess
import torch
from marker.logger import get_logger
from marker.settings import settings
logger = get_logger()
class GPUManager:
default_gpu_vram: int = 8
def __init__(self, device_idx: int):
self.device_idx = device_idx
self.original_compute_mode = None
self.mps_server_process = None
def __enter__(self):
if self.using_cuda():
self.start_mps_server()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
if self.using_cuda():
self.cleanup()
@staticmethod
def using_cuda():
return "cuda" in settings.TORCH_DEVICE_MODEL
def check_cuda_available(self) -> bool:
if not torch.cuda.is_available():
return False
try:
subprocess.run(["nvidia-smi", "--version"], capture_output=True, check=True)
return True
except (subprocess.CalledProcessError, FileNotFoundError):
return False
def get_gpu_vram(self):
if not self.using_cuda():
return self.default_gpu_vram
try:
result = subprocess.run(
[
"nvidia-smi",
"--query-gpu=memory.total",
"--format=csv,noheader,nounits",
"-i",
str(self.device_idx),
],
capture_output=True,
text=True,
check=True,
)
vram_mb = int(result.stdout.strip())
vram_gb = int(vram_mb / 1024)
return vram_gb
except (subprocess.CalledProcessError, ValueError, FileNotFoundError):
return self.default_gpu_vram
def start_mps_server(self) -> bool:
if not self.check_cuda_available():
return False
try:
# Set MPS environment with chunk-specific directories
env = os.environ.copy()
pipe_dir = f"/tmp/nvidia-mps-{self.device_idx}"
log_dir = f"/tmp/nvidia-log-{self.device_idx}"
env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir
env["CUDA_MPS_LOG_DIRECTORY"] = log_dir
# Create directories
os.makedirs(pipe_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
# Start MPS control daemon
self.mps_server_process = subprocess.Popen(
["nvidia-cuda-mps-control", "-d"],
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
logger.info(f"Started NVIDIA MPS server for chunk {self.device_idx}")
return True
except (subprocess.CalledProcessError, FileNotFoundError) as e:
logger.warning(
f"Failed to start MPS server for chunk {self.device_idx}: {e}"
)
return False
def stop_mps_server(self) -> None:
try:
# Stop MPS server
env = os.environ.copy()
env["CUDA_MPS_PIPE_DIRECTORY"] = f"/tmp/nvidia-mps-{self.device_idx}"
env["CUDA_MPS_LOG_DIRECTORY"] = f"/tmp/nvidia-log-{self.device_idx}"
subprocess.run(
["nvidia-cuda-mps-control"],
input="quit\n",
text=True,
env=env,
timeout=10,
)
if self.mps_server_process:
self.mps_server_process.terminate()
try:
self.mps_server_process.wait(timeout=5)
except subprocess.TimeoutExpired:
self.mps_server_process.kill()
self.mps_server_process = None
logger.info(f"Stopped NVIDIA MPS server for chunk {self.device_idx}")
except Exception as e:
logger.warning(
f"Failed to stop MPS server for chunk {self.device_idx}: {e}"
)
def cleanup(self) -> None:
self.stop_mps_server()
```
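A brief usage sketch: GPUManager is a context manager, so the MPS server lifecycle is tied to the `with` block; on non-CUDA devices it only reports the default VRAM figure.
```python
# Illustrative usage; MPS startup/cleanup only happens when TORCH_DEVICE_MODEL is cuda.
from marker.utils.gpu import GPUManager

with GPUManager(device_idx=0) as gpu:
    vram_gb = gpu.get_gpu_vram()  # falls back to default_gpu_vram (8) if nvidia-smi is unavailable
    print(f"GPU 0 VRAM: {vram_gb} GB")
```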
--------------------------------------------------------------------------------
/marker/processors/sectionheader.py:
--------------------------------------------------------------------------------
```python
import warnings
from typing import Annotated, Dict, List
import numpy as np
from sklearn.cluster import KMeans
from sklearn.exceptions import ConvergenceWarning
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
# Ignore sklearn warning about not converging
warnings.filterwarnings("ignore", category=ConvergenceWarning)
class SectionHeaderProcessor(BaseProcessor):
"""
A processor for recognizing section headers in the document.
"""
block_types = (BlockTypes.SectionHeader, )
level_count: Annotated[
int,
"The number of levels to use for headings.",
] = 4
merge_threshold: Annotated[
float,
"The minimum gap between headings to consider them part of the same group.",
] = 0.25
default_level: Annotated[
int,
"The default heading level to use if no heading level is detected.",
] = 2
height_tolerance: Annotated[
float,
"The minimum height of a heading to consider it a heading.",
] = 0.99
def __call__(self, document: Document):
line_heights: Dict[int, float] = {}
for page in document.pages:
# Iterate children to grab all section headers
for block in page.children:
if block.block_type not in self.block_types:
continue
if block.structure is not None:
line_heights[block.id] = block.line_height(document)
else:
line_heights[block.id] = 0
block.ignore_for_output = True # Don't output an empty section header
flat_line_heights = list(line_heights.values())
heading_ranges = self.bucket_headings(flat_line_heights)
for page in document.pages:
# Iterate children to grab all section headers
for block in page.children:
if block.block_type not in self.block_types:
continue
block_height = line_heights.get(block.id, 0)
if block_height > 0:
for idx, (min_height, max_height) in enumerate(heading_ranges):
if block_height >= min_height * self.height_tolerance:
block.heading_level = idx + 1
break
if block.heading_level is None:
block.heading_level = self.default_level
def bucket_headings(self, line_heights: List[float], num_levels=4):
if len(line_heights) <= self.level_count:
return []
data = np.asarray(line_heights).reshape(-1, 1)
labels = KMeans(n_clusters=num_levels, random_state=0, n_init="auto").fit_predict(data)
data_labels = np.concatenate([data, labels.reshape(-1, 1)], axis=1)
data_labels = np.sort(data_labels, axis=0)
cluster_means = {int(label): float(np.mean(data_labels[data_labels[:, 1] == label, 0])) for label in np.unique(labels)}
label_max = None
label_min = None
heading_ranges = []
prev_cluster = None
for row in data_labels:
value, label = row
value = float(value)
label = int(label)
if prev_cluster is not None and label != prev_cluster:
prev_cluster_mean = cluster_means[prev_cluster]
cluster_mean = cluster_means[label]
if cluster_mean * self.merge_threshold < prev_cluster_mean:
heading_ranges.append((label_min, label_max))
label_min = None
label_max = None
label_min = value if label_min is None else min(label_min, value)
label_max = value if label_max is None else max(label_max, value)
prev_cluster = label
if label_min is not None:
heading_ranges.append((label_min, label_max))
heading_ranges = sorted(heading_ranges, reverse=True)
return heading_ranges
```
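To make the bucketing step concrete, here is a small standalone sketch (not part of marker) that clusters hypothetical heading line heights with KMeans, mirroring what bucket_headings does before the clusters are turned into (min, max) ranges.
```python
# Standalone illustration of the clustering idea behind bucket_headings (heights are made up).
import numpy as np
from sklearn.cluster import KMeans

heights = np.array([24.0, 23.5, 18.2, 17.9, 14.1, 13.8, 11.0, 10.9]).reshape(-1, 1)
labels = KMeans(n_clusters=4, random_state=0, n_init="auto").fit_predict(heights)
for height, label in zip(heights.ravel(), labels):
    print(f"line height {height:.1f} -> cluster {label}")
```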
--------------------------------------------------------------------------------
/marker/config/printer.py:
--------------------------------------------------------------------------------
```python
from typing import Optional
import click
from marker.config.crawler import crawler
class CustomClickPrinter(click.Command):
def parse_args(self, ctx, args):
display_help = "config" in args and "--help" in args
if display_help:
click.echo(
"Here is a list of all the Builders, Processors, Converters, Providers and Renderers in Marker along with their attributes:"
)
# Keep track of shared attributes and their types
shared_attrs = {}
# First pass: identify shared attributes and verify compatibility
for base_type, base_type_dict in crawler.class_config_map.items():
for class_name, class_map in base_type_dict.items():
for attr, (attr_type, formatted_type, default, metadata) in class_map[
"config"
].items():
if attr not in shared_attrs:
shared_attrs[attr] = {
"classes": [],
"type": attr_type,
"is_flag": attr_type in [bool, Optional[bool]]
and not default,
"metadata": metadata,
"default": default,
}
shared_attrs[attr]["classes"].append(class_name)
# These are the types of attrs that can be set from the command line
attr_types = [
str,
int,
float,
bool,
Optional[int],
Optional[float],
Optional[str],
]
# Add shared attribute options first
for attr, info in shared_attrs.items():
if info["type"] in attr_types:
ctx.command.params.append(
click.Option(
["--" + attr],
type=info["type"],
help=" ".join(info["metadata"])
+ f" (Applies to: {', '.join(info['classes'])})",
default=None, # This is important, or it sets all the default keys again in config
is_flag=info["is_flag"],
flag_value=True if info["is_flag"] else None,
)
)
# Second pass: create class-specific options
for base_type, base_type_dict in crawler.class_config_map.items():
if display_help:
click.echo(f"{base_type}s:")
for class_name, class_map in base_type_dict.items():
if display_help and class_map["config"]:
click.echo(
f"\n {class_name}: {class_map['class_type'].__doc__ or ''}"
)
click.echo(" " * 4 + "Attributes:")
for attr, (attr_type, formatted_type, default, metadata) in class_map[
"config"
].items():
class_name_attr = class_name + "_" + attr
if display_help:
click.echo(" " * 8 + f"{attr} ({formatted_type}):")
click.echo(
"\n".join([f"{' ' * 12}" + desc for desc in metadata])
)
if attr_type in attr_types:
is_flag = attr_type in [bool, Optional[bool]] and not default
# Only add class-specific options
ctx.command.params.append(
click.Option(
["--" + class_name_attr, class_name_attr],
type=attr_type,
help=" ".join(metadata),
is_flag=is_flag,
default=None, # This is important, or it sets all the default keys again in config
)
)
if display_help:
ctx.exit()
super().parse_args(ctx, args)
```
--------------------------------------------------------------------------------
/marker/processors/text.py:
--------------------------------------------------------------------------------
```python
import math
from typing import Annotated, List
import regex
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.text.line import Line
class TextProcessor(BaseProcessor):
"""
A processor for merging text across pages and columns.
"""
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
ignored_block_types = (BlockTypes.PageHeader, BlockTypes.PageFooter)
column_gap_ratio: Annotated[
float,
"The minimum ratio of the page width to the column gap to consider a column break.",
] = 0.02
def __init__(self, config):
super().__init__(config)
def __call__(self, document: Document):
for page in document.pages:
for block in page.contained_blocks(document, self.block_types):
if block.structure is None:
continue
if not len(block.structure) >= 2: # Skip single lines
continue
next_block = document.get_next_block(block, self.ignored_block_types)
if next_block is None: # we've reached the end of the document
continue
if next_block.block_type not in self.block_types:
continue # we found a non-text block
if next_block.structure is None:
continue # This is odd though, why do we have text blocks with no structure?
if next_block.ignore_for_output:
continue # skip ignored blocks
column_gap = block.polygon.width * self.column_gap_ratio
column_break, page_break = False, False
next_block_starts_indented = True
next_block_in_first_quadrant = False
last_line_is_full_width = False
last_line_is_hyphentated = False
if next_block.page_id == block.page_id: # block on the same page
# we check for a column break
column_break = math.floor(next_block.polygon.y_start) <= math.ceil(
block.polygon.y_start
) and next_block.polygon.x_start > (
block.polygon.x_end + column_gap
)
else:
page_break = True
next_page = document.get_page(next_block.page_id)
next_block_in_first_quadrant = (
next_block.polygon.x_start < next_page.polygon.width // 2
) and (next_block.polygon.y_start < next_page.polygon.height // 2)
if not (column_break or page_break):
continue
new_block_lines = next_block.structure_blocks(document)
# we check for next_block indentation
if len(new_block_lines):
min_x = math.ceil(
min([line.polygon.x_start for line in new_block_lines])
)
next_block_starts_indented = (
new_block_lines[0].polygon.x_start > min_x
)
lines: List[Line] = [
line
for line in block.structure_blocks(document)
if line.polygon.width > 1
]
if len(lines):
max_x = math.floor(max([line.polygon.x_end for line in lines]))
last_line_is_full_width = lines[-1].polygon.x_end >= max_x
last_line_is_hyphentated = regex.compile(
r".*[\p{Ll}|\d][-—¬]\s?$", regex.DOTALL
).match(lines[-1].raw_text(document).strip())
if (
(last_line_is_full_width or last_line_is_hyphentated)
and not next_block_starts_indented
and ((next_block_in_first_quadrant and page_break) or column_break)
):
block.has_continuation = True
```
--------------------------------------------------------------------------------
/benchmarks/overall/methods/__init__.py:
--------------------------------------------------------------------------------
```python
import io
import random
import re
from typing import Tuple
import markdown2
from PIL import Image
from playwright.sync_api import sync_playwright
from benchmarks.overall.methods.schema import BenchmarkResult
from marker.renderers.markdown import MarkdownRenderer
class BaseMethod:
def __init__(self, **kwargs):
for kwarg in kwargs:
if hasattr(self, kwarg):
setattr(self, kwarg, kwargs[kwarg])
@staticmethod
def convert_to_md(html: str):
md = MarkdownRenderer()
markdown = md.md_cls.convert(html)
return markdown
def __call__(self, sample) -> BenchmarkResult:
raise NotImplementedError()
def render(self, markdown: str):
return self.html_to_image(self.convert_to_html(markdown))
@staticmethod
def convert_to_html(md: str):
block_placeholders = []
inline_placeholders = []
# Add placeholders for the math
def block_sub(match):
content = match.group(1)
placeholder = f"1BLOCKMATH{len(block_placeholders)}1"
block_placeholders.append((placeholder, f"$${content}$$"))
return placeholder
def inline_sub(match):
content = match.group(1)
placeholder = f"1INLINEMATH{len(inline_placeholders)}1"
inline_placeholders.append((placeholder, f"${content}$"))
return placeholder
md = re.sub(r'\${2}(.*?)\${2}', block_sub, md, flags=re.DOTALL)
md = re.sub(r'\$(.*?)\$', inline_sub, md)
html = markdown2.markdown(md, extras=['tables'])
# Replace placeholders
for placeholder, math_str in block_placeholders:
html = html.replace(placeholder, math_str)
for placeholder, math_str in inline_placeholders:
html = html.replace(placeholder, math_str)
return html
def html_to_image(self, html: str) -> Image.Image:
with sync_playwright() as p:
browser = p.chromium.launch()
page = browser.new_page()
html_str = f"""
<!DOCTYPE html>
<html>
<head>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
<!-- The loading of KaTeX is deferred to speed up page rendering -->
<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
<!-- To automatically render math in text elements, include the auto-render extension: -->
<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
</head>
<body>
{html}
<script>
document.addEventListener("DOMContentLoaded", function() {{
renderMathInElement(document.body, {{
delimiters: [
{{left: '$$', right: '$$', display: true}},
{{left: '$', right: '$', display: false}}
],
throwOnError : false
}});
}});
</script>
</body>
</html>
""".strip()
page.set_viewport_size({"width": 1200, "height": 800})
page.set_content(html_str)
page.wait_for_load_state("domcontentloaded")
page.wait_for_timeout(500) # Wait for KaTeX to render
screenshot_bytes = page.screenshot(full_page=True)
browser.close()
return Image.open(io.BytesIO(screenshot_bytes))
```
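Since convert_to_html is a staticmethod, the math-placeholder round trip can be checked directly; the sample markdown below is made up.
```python
# Illustrative check that $...$ and $$...$$ survive the markdown2 conversion via placeholders.
from benchmarks.overall.methods import BaseMethod

sample_md = "Euler's identity $e^{i\\pi} + 1 = 0$ and a display block:\n\n$$\\int_0^1 x\\,dx = 1/2$$"
html_out = BaseMethod.convert_to_html(sample_md)
print(html_out)  # the $ and $$ delimiters are restored after markdown rendering
```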
--------------------------------------------------------------------------------
/marker/processors/llm/llm_form.py:
--------------------------------------------------------------------------------
```python
from typing import List
from pydantic import BaseModel
from marker.output import json_to_html
from marker.processors.llm import PromptData, BaseLLMSimpleBlockProcessor, BlockData
from marker.schema import BlockTypes
from marker.schema.document import Document
class LLMFormProcessor(BaseLLMSimpleBlockProcessor):
block_types = (BlockTypes.Form,)
form_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
You will receive an image of a text block and an html representation of the form in the image.
Your task is to correct any errors in the html representation, and format it properly.
Values and labels should appear in html tables, with the labels on the left side, and values on the right. Other text in the form can appear between the tables. Only use the tags `table, p, span, i, b, th, td, tr, and div`. Do not omit any text from the form - make sure everything is included in the html representation. It should be as faithful to the original form as possible.
**Instructions:**
1. Carefully examine the provided form block image.
2. Analyze the html representation of the form.
3. Compare the html representation to the image.
4. If the html representation is correct, or you cannot read the image properly, then write "No corrections needed."
5. If the html representation contains errors, generate the corrected html representation.
6. Output only either the corrected html representation or "No corrections needed."
**Example:**
Input:
```html
<table>
<tr>
<td>Label 1</td>
<td>Label 2</td>
<td>Label 3</td>
</tr>
<tr>
<td>Value 1</td>
<td>Value 2</td>
<td>Value 3</td>
</tr>
</table>
```
Output:
Comparison: The html representation has the labels in the first row and the values in the second row. It should be corrected to have the labels on the left side and the values on the right side.
```html
<table>
<tr>
<td>Label 1</td>
<td>Value 1</td>
</tr>
<tr>
<td>Label 2</td>
<td>Value 2</td>
</tr>
<tr>
<td>Label 3</td>
<td>Value 3</td>
</tr>
</table>
```
**Input:**
```html
{block_html}
```
"""
def inference_blocks(self, document: Document) -> List[BlockData]:
blocks = super().inference_blocks(document)
out_blocks = []
for block_data in blocks:
block = block_data["block"]
children = block.contained_blocks(document, (BlockTypes.TableCell,))
if not children:
continue
out_blocks.append(block_data)
return out_blocks
def block_prompts(self, document: Document) -> List[PromptData]:
prompt_data = []
for block_data in self.inference_blocks(document):
block = block_data["block"]
block_html = json_to_html(block.render(document))
prompt = self.form_rewriting_prompt.replace("{block_html}", block_html)
image = self.extract_image(document, block)
prompt_data.append({
"prompt": prompt,
"image": image,
"block": block,
"schema": FormSchema,
"page": block_data["page"]
})
return prompt_data
def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
block = prompt_data["block"]
block_html = json_to_html(block.render(document))
if not response or "corrected_html" not in response:
block.update_metadata(llm_error_count=1)
return
corrected_html = response["corrected_html"]
# The original table is okay
if "no corrections needed" in corrected_html.lower():
return
# Potentially a partial response
if len(corrected_html) < len(block_html) * .33:
block.update_metadata(llm_error_count=1)
return
corrected_html = corrected_html.strip().removeprefix("```html").removeprefix("```").removesuffix("```").strip()  # removeprefix/removesuffix strip the fence without eating leading content characters
block.html = corrected_html
class FormSchema(BaseModel):
comparison: str
corrected_html: str
```
--------------------------------------------------------------------------------
/marker/renderers/ocr_json.py:
--------------------------------------------------------------------------------
```python
from typing import Annotated, List, Tuple
from pydantic import BaseModel
from marker.renderers import BaseRenderer
from marker.schema import BlockTypes
from marker.schema.document import Document
class OCRJSONCharOutput(BaseModel):
id: str
block_type: str
text: str
polygon: List[List[float]]
bbox: List[float]
class OCRJSONLineOutput(BaseModel):
id: str
block_type: str
html: str
polygon: List[List[float]]
bbox: List[float]
children: List["OCRJSONCharOutput"] | None = None
class OCRJSONPageOutput(BaseModel):
id: str
block_type: str
polygon: List[List[float]]
bbox: List[float]
children: List[OCRJSONLineOutput] | None = None
class OCRJSONOutput(BaseModel):
children: List[OCRJSONPageOutput]
block_type: str = str(BlockTypes.Document)
metadata: dict | None = None
class OCRJSONRenderer(BaseRenderer):
"""
A renderer for OCR JSON output.
"""
image_blocks: Annotated[
Tuple[BlockTypes],
"The list of block types to consider as images.",
] = (BlockTypes.Picture, BlockTypes.Figure)
page_blocks: Annotated[
Tuple[BlockTypes],
"The list of block types to consider as pages.",
] = (BlockTypes.Page,)
def extract_json(self, document: Document) -> List[OCRJSONPageOutput]:
pages = []
for page in document.pages:
page_equations = [
b for b in page.children if b.block_type == BlockTypes.Equation
and not b.removed
]
equation_lines = []
for equation in page_equations:
if not equation.structure:
continue
equation_lines += [
line
for line in equation.structure
if line.block_type == BlockTypes.Line
]
page_lines = [
block
for block in page.children
if block.block_type == BlockTypes.Line
and block.id not in equation_lines
and not block.removed
]
lines = []
for line in page_lines + page_equations:
line_obj = OCRJSONLineOutput(
id=str(line.id),
block_type=str(line.block_type),
html="",
polygon=line.polygon.polygon,
bbox=line.polygon.bbox,
)
if line in page_equations:
line_obj.html = line.html
else:
line_obj.html = line.formatted_text(document)
spans = (
[document.get_block(span_id) for span_id in line.structure]
if line.structure
else []
)
children = []
for span in spans:
if not span.structure:
continue
span_chars = [
document.get_block(char_id) for char_id in span.structure
]
children.extend(
[
OCRJSONCharOutput(
id=str(char.id),
block_type=str(char.block_type),
text=char.text,
polygon=char.polygon.polygon,
bbox=char.polygon.bbox,
)
for char in span_chars
]
)
line_obj.children = children
lines.append(line_obj)
page = OCRJSONPageOutput(
id=str(page.id),
block_type=str(page.block_type),
polygon=page.polygon.polygon,
bbox=page.polygon.bbox,
children=lines,
)
pages.append(page)
return pages
def __call__(self, document: Document) -> OCRJSONOutput:
return OCRJSONOutput(children=self.extract_json(document), metadata=None)
```
--------------------------------------------------------------------------------
/marker/services/openai.py:
--------------------------------------------------------------------------------
```python
import json
import time
from typing import Annotated, List
import openai
import PIL
from marker.logger import get_logger
from openai import APITimeoutError, RateLimitError
from PIL import Image
from pydantic import BaseModel
from marker.schema.blocks import Block
from marker.services import BaseService
logger = get_logger()
class OpenAIService(BaseService):
openai_base_url: Annotated[
str, "The base url to use for OpenAI-like models. No trailing slash."
] = "https://api.openai.com/v1"
openai_model: Annotated[str, "The model name to use for OpenAI-like model."] = (
"gpt-4o-mini"
)
openai_api_key: Annotated[
str, "The API key to use for the OpenAI-like service."
] = None
openai_image_format: Annotated[
str,
"The image format to use for the OpenAI-like service. Use 'png' for better compatability",
] = "webp"
def process_images(self, images: List[Image.Image]) -> List[dict]:
"""
Generate the base64-encoded message to send to an
OpenAI-compatible multimodal model.
Args:
images: Image or list of PIL images to include. The image format is taken
from `openai_image_format`; use "png" for better compatibility.
Returns:
A list of OpenAI-compatible multimodal messages containing the base64-encoded images.
"""
if isinstance(images, Image.Image):
images = [images]
img_fmt = self.openai_image_format
return [
{
"type": "image_url",
"image_url": {
"url": "data:image/{};base64,{}".format(
img_fmt, self.img_to_base64(img, format=img_fmt)
),
},
}
for img in images
]
def __call__(
self,
prompt: str,
image: PIL.Image.Image | List[PIL.Image.Image] | None,
block: Block | None,
response_schema: type[BaseModel],
max_retries: int | None = None,
timeout: int | None = None,
):
if max_retries is None:
max_retries = self.max_retries
if timeout is None:
timeout = self.timeout
client = self.get_client()
image_data = self.format_image_for_llm(image)
messages = [
{
"role": "user",
"content": [
*image_data,
{"type": "text", "text": prompt},
],
}
]
total_tries = max_retries + 1
for tries in range(1, total_tries + 1):
try:
response = client.beta.chat.completions.parse(
extra_headers={
"X-Title": "Marker",
"HTTP-Referer": "https://github.com/datalab-to/marker",
},
model=self.openai_model,
messages=messages,
timeout=timeout,
response_format=response_schema,
)
response_text = response.choices[0].message.content
total_tokens = response.usage.total_tokens
if block:
block.update_metadata(
llm_tokens_used=total_tokens, llm_request_count=1
)
return json.loads(response_text)
except (APITimeoutError, RateLimitError) as e:
# Rate limit exceeded
if tries == total_tries:
# Last attempt failed. Give up
logger.error(
f"Rate limit error: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
)
break
else:
wait_time = tries * self.retry_wait_time
logger.warning(
f"Rate limit error: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})",
)
time.sleep(wait_time)
except Exception as e:
logger.error(f"OpenAI inference failed: {e}")
break
return {}
def get_client(self) -> openai.OpenAI:
return openai.OpenAI(api_key=self.openai_api_key, base_url=self.openai_base_url)
```
--------------------------------------------------------------------------------
/marker/services/claude.py:
--------------------------------------------------------------------------------
```python
import json
import time
from typing import List, Annotated, T
import PIL
from PIL import Image
import anthropic
from anthropic import RateLimitError, APITimeoutError
from marker.logger import get_logger
from pydantic import BaseModel
from marker.schema.blocks import Block
from marker.services import BaseService
logger = get_logger()
class ClaudeService(BaseService):
claude_model_name: Annotated[
str, "The name of the Google model to use for the service."
] = "claude-3-7-sonnet-20250219"
claude_api_key: Annotated[str, "The Claude API key to use for the service."] = None
max_claude_tokens: Annotated[
int, "The maximum number of tokens to use for a single Claude request."
] = 8192
def process_images(self, images: List[Image.Image]) -> List[dict]:
return [
{
"type": "image",
"source": {
"type": "base64",
"media_type": "image/webp",
"data": self.img_to_base64(img),
},
}
for img in images
]
def validate_response(self, response_text: str, schema: type[T]) -> T:
response_text = response_text.strip()
if response_text.startswith("```json"):
response_text = response_text[7:]
if response_text.endswith("```"):
response_text = response_text[:-3]
try:
# Try to parse as JSON first
out_schema = schema.model_validate_json(response_text)
out_json = out_schema.model_dump()
return out_json
except Exception:
try:
# Re-parse with fixed escapes
escaped_str = response_text.replace("\\", "\\\\")
out_schema = schema.model_validate_json(escaped_str)
return out_schema.model_dump()
except Exception:
return
def get_client(self):
return anthropic.Anthropic(
api_key=self.claude_api_key,
)
def __call__(
self,
prompt: str,
image: PIL.Image.Image | List[PIL.Image.Image] | None,
block: Block | None,
response_schema: type[BaseModel],
max_retries: int | None = None,
timeout: int | None = None,
):
if max_retries is None:
max_retries = self.max_retries
if timeout is None:
timeout = self.timeout
schema_example = response_schema.model_json_schema()
system_prompt = f"""
Follow the instructions given by the user prompt. You must provide your response in JSON format matching this schema:
{json.dumps(schema_example, indent=2)}
Respond only with the JSON schema, nothing else. Do not include ```json, ```, or any other formatting.
""".strip()
client = self.get_client()
image_data = self.format_image_for_llm(image)
messages = [
{
"role": "user",
"content": [
*image_data,
{"type": "text", "text": prompt},
],
}
]
total_tries = max_retries + 1
for tries in range(1, total_tries + 1):
try:
response = client.messages.create(
system=system_prompt,
model=self.claude_model_name,
max_tokens=self.max_claude_tokens,
messages=messages,
timeout=timeout,
)
# Extract and validate response
response_text = response.content[0].text
return self.validate_response(response_text, response_schema)
except (RateLimitError, APITimeoutError) as e:
# Rate limit exceeded
if tries == total_tries:
# Last attempt failed. Give up
logger.error(
f"Rate limit error: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
)
break
else:
wait_time = tries * self.retry_wait_time
logger.warning(
f"Rate limit error: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})",
)
time.sleep(wait_time)
except Exception as e:
logger.error(f"Error during Claude API call: {e}")
break
return {}
```
--------------------------------------------------------------------------------
/CLA.md:
--------------------------------------------------------------------------------
```markdown
Marker Contributor Agreement
This Marker Contributor Agreement ("MCA") applies to any contribution that you make to any product or project managed by us (the "project"), and sets out the intellectual property rights you grant to us in the contributed materials. The term "us" shall mean Endless Labs, Inc. The term "you" shall mean the person or entity identified below.
If you agree to be bound by these terms, sign by writing "I have read the CLA document and I hereby sign the CLA" in response to the CLA bot Github comment. Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.
1. The term 'contribution' or 'contributed materials' means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.
2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:
- you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements, including dual-license structures for commercial customers;
- you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
- you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
- you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
- you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.
3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:
- make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
- at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.
If you or your affiliates institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the contribution or any project it was submitted to constitutes direct or contributory patent infringement, then any patent licenses granted to you under this agreement for that contribution shall terminate as of the date such litigation is filed.
4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms. Any contribution we make available under any license will also be made available under a suitable FSF (Free Software Foundation) or OSI (Open Source Initiative) approved license.
5. You covenant, represent, warrant and agree that:
- each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this MCA;
- to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
- each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws.
You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. Endless Labs, Inc. may publicly disclose your participation in the project, including the fact that you have signed the MCA.
6. This MCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.
```
--------------------------------------------------------------------------------
/marker/schema/text/line.py:
--------------------------------------------------------------------------------
```python
import html
import re
from typing import Literal, List
import regex
from marker.schema import BlockTypes
from marker.schema.blocks import Block, BlockOutput
HYPHENS = r"-—¬"
def remove_tags(text):
return re.sub(r"<[^>]+>", "", text)
def replace_last(string, old, new):
matches = list(re.finditer(old, string))
if not matches:
return string
last_match = matches[-1]
return string[: last_match.start()] + new + string[last_match.end() :]
def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str:
lowercase_letters = r"\p{Ll}"
hyphen_regex = regex.compile(rf".*[{HYPHENS}]\s?$", regex.DOTALL)
next_line_starts_lowercase = regex.match(
rf"^\s?[{lowercase_letters}]", next_line_text
)
if hyphen_regex.match(line_text) and next_line_starts_lowercase:
line_html = replace_last(line_html, rf"[{HYPHENS}]", "")
return line_html
class Line(Block):
block_type: BlockTypes = BlockTypes.Line
block_description: str = "A line of text."
formats: List[Literal["math"]] | None = (
None # Sometimes we want to set math format at the line level, not span
)
def ocr_input_text(self, document):
text = ""
for block in self.contained_blocks(document, (BlockTypes.Span,)):
# We don't include superscripts/subscripts and math since they can be unreliable at this stage
block_text = block.text
if block.italic:
text += f"<i>{block_text}</i>"
elif block.bold:
text += f"<b>{block_text}</b>"
else:
text += block_text
return text.strip()
def formatted_text(self, document, skip_urls=False):
text = ""
for block in self.contained_blocks(document, (BlockTypes.Span,)):
block_text = html.escape(block.text)
if block.has_superscript:
block_text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", block_text)
if "<sup>" not in block_text:
block_text = f"<sup>{block_text}</sup>"
if block.url and not skip_urls:
block_text = f"<a href='{block.url}'>{block_text}</a>"
if block.italic:
text += f"<i>{block_text}</i>"
elif block.bold:
text += f"<b>{block_text}</b>"
elif block.math:
text += f"<math display='inline'>{block_text}</math>"
else:
text += block_text
return text
def assemble_html(self, document, child_blocks, parent_structure, block_config):
template = ""
for c in child_blocks:
template += c.html
raw_text = remove_tags(template).strip()
structure_idx = parent_structure.index(self.id)
if structure_idx < len(parent_structure) - 1:
next_block_id = parent_structure[structure_idx + 1]
next_line = document.get_block(next_block_id)
next_line_raw_text = next_line.raw_text(document)
template = strip_trailing_hyphens(raw_text, next_line_raw_text, template)
else:
template = template.strip(
" "
) # strip any trailing whitespace from the last line
return template
def render(
self, document, parent_structure, section_hierarchy=None, block_config=None
):
child_content = []
if self.structure is not None and len(self.structure) > 0:
for block_id in self.structure:
block = document.get_block(block_id)
child_content.append(
block.render(
document, parent_structure, section_hierarchy, block_config
)
)
return BlockOutput(
html=self.assemble_html(
document, child_content, parent_structure, block_config
),
polygon=self.polygon,
id=self.id,
children=[],
section_hierarchy=section_hierarchy,
)
def merge(self, other: "Line"):
self.polygon = self.polygon.merge([other.polygon])
# Handle merging structure with Nones
if self.structure is None:
self.structure = other.structure
elif other.structure is not None:
self.structure = self.structure + other.structure
# Merge formats with Nones
if self.formats is None:
self.formats = other.formats
elif other.formats is not None:
self.formats = list(set(self.formats + other.formats))
```
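A small illustration of the module-level hyphen helpers above (the strings are made up): a trailing hyphen is only dropped when the next line starts with a lowercase letter.
```python
# Illustrative calls to the helpers defined at the top of this module.
from marker.schema.text.line import replace_last, strip_trailing_hyphens

print(strip_trailing_hyphens("convolu-", "tional network", "convolu-"))    # -> "convolu"
print(strip_trailing_hyphens("Figure 3-", "A shows results", "Figure 3-")) # unchanged: next line starts uppercase
print(replace_last("a-b-c", "-", "+"))  # -> "a-b+c"
```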
--------------------------------------------------------------------------------
/marker/processors/list.py:
--------------------------------------------------------------------------------
```python
from typing import Annotated, List, Tuple
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import ListItem
from marker.schema.document import Document
class ListProcessor(BaseProcessor):
"""
A processor for merging lists across pages and columns
"""
block_types = (BlockTypes.ListGroup,)
ignored_block_types: Annotated[
Tuple[BlockTypes],
"The list of block types to ignore when merging lists.",
] = (BlockTypes.PageHeader, BlockTypes.PageFooter)
min_x_indent: Annotated[
float, "The minimum horizontal indentation required to consider a block as a nested list item.",
"This is expressed as a percentage of the page width and is used to determine hierarchical relationships within a list.",
] = 0.01
def __init__(self, config):
super().__init__(config)
def __call__(self, document: Document):
self.list_group_continuation(document)
self.list_group_indentation(document)
def list_group_continuation(self, document: Document):
for page in document.pages:
for block in page.contained_blocks(document, self.block_types):
next_block = document.get_next_block(block, self.ignored_block_types)
if next_block is None:
continue
if next_block.block_type not in self.block_types:
continue
if next_block.structure is None:
continue
if next_block.ignore_for_output:
continue
column_break, page_break = False, False
next_block_in_first_quadrant = False
if next_block.page_id == block.page_id: # block on the same page
# we check for a column break
column_break = next_block.polygon.y_start <= block.polygon.y_end
else:
page_break = True
next_page = document.get_page(next_block.page_id)
next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
(next_block.polygon.y_start < next_page.polygon.height // 2)
block.has_continuation = column_break or (page_break and next_block_in_first_quadrant)
def list_group_indentation(self, document: Document):
for page in document.pages:
for block in page.contained_blocks(document, self.block_types):
if block.structure is None:
continue
if block.ignore_for_output:
continue
stack: List[ListItem] = [block.get_next_block(page, None)]
for list_item_id in block.structure:
list_item_block: ListItem = page.get_block(list_item_id)
# This can be a line sometimes
if list_item_block.block_type != BlockTypes.ListItem:
continue
while stack and list_item_block.polygon.x_start <= stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width):
stack.pop()
if stack and list_item_block.polygon.y_start > stack[-1].polygon.y_start:
list_item_block.list_indent_level = stack[-1].list_indent_level
if list_item_block.polygon.x_start > stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width):
list_item_block.list_indent_level += 1
next_list_item_block = block.get_next_block(page, list_item_block)
if next_list_item_block is not None and next_list_item_block.polygon.x_start > list_item_block.polygon.x_end:
stack = [next_list_item_block] # reset stack on column breaks
else:
stack.append(list_item_block)
stack: List[ListItem] = [block.get_next_block(page, None)]
for list_item_id in block.structure.copy():
list_item_block: ListItem = page.get_block(list_item_id)
while stack and list_item_block.list_indent_level <= stack[-1].list_indent_level:
stack.pop()
if stack:
current_parent = stack[-1]
current_parent.add_structure(list_item_block)
current_parent.polygon = current_parent.polygon.merge([list_item_block.polygon])
block.remove_structure_items([list_item_id])
stack.append(list_item_block)
```
--------------------------------------------------------------------------------
/marker/config/crawler.py:
--------------------------------------------------------------------------------
```python
import importlib
import inspect
import pkgutil
from functools import cached_property
from typing import Annotated, Dict, Set, Type, get_args, get_origin
from marker.builders import BaseBuilder
from marker.converters import BaseConverter
from marker.extractors import BaseExtractor
from marker.processors import BaseProcessor
from marker.providers import BaseProvider
from marker.renderers import BaseRenderer
from marker.services import BaseService
class ConfigCrawler:
def __init__(
self,
base_classes=(
BaseBuilder,
BaseProcessor,
BaseConverter,
BaseProvider,
BaseRenderer,
BaseService,
BaseExtractor,
),
):
self.base_classes = base_classes
self.class_config_map: Dict[str, dict] = {}
self._crawl_config()
def _crawl_config(self):
for base in self.base_classes:
base_class_type = base.__name__.removeprefix("Base")
self.class_config_map.setdefault(base_class_type, {})
for class_name, class_type in self._find_subclasses(base).items():
if class_name.startswith("Base"):
continue
self.class_config_map[base_class_type].setdefault(
class_name, {"class_type": class_type, "config": {}}
)
for attr, attr_type in self._gather_super_annotations(
class_type
).items():
default = getattr(class_type, attr)
metadata = (f"Default is {default}.",)
if get_origin(attr_type) is Annotated:
if any("Default" in desc for desc in attr_type.__metadata__):
metadata = attr_type.__metadata__
else:
metadata = attr_type.__metadata__ + metadata
attr_type = get_args(attr_type)[0]
formatted_type = self._format_type(attr_type)
self.class_config_map[base_class_type][class_name]["config"][
attr
] = (attr_type, formatted_type, default, metadata)
@staticmethod
def _gather_super_annotations(cls: Type) -> Dict[str, Type]:
"""
Collect all annotated attributes from `cls` and its superclasses, bottom-up.
Subclass attributes overwrite superclass attributes with the same name.
"""
# We'll walk the MRO from base -> derived so subclass attributes overwrite
# the same attribute name from superclasses.
annotations = {}
for base in reversed(cls.__mro__):
if base is object:
continue
if hasattr(base, "__annotations__"):
for name, annotation in base.__annotations__.items():
annotations[name] = annotation
return annotations
@cached_property
def attr_counts(self) -> Dict[str, int]:
counts: Dict[str, int] = {}
for base_type_dict in self.class_config_map.values():
for class_map in base_type_dict.values():
for attr in class_map["config"].keys():
counts[attr] = counts.get(attr, 0) + 1
return counts
@cached_property
def attr_set(self) -> Set[str]:
attr_set: Set[str] = set()
for base_type_dict in self.class_config_map.values():
for class_name, class_map in base_type_dict.items():
for attr in class_map["config"].keys():
attr_set.add(attr)
attr_set.add(f"{class_name}_{attr}")
return attr_set
def _find_subclasses(self, base_class):
subclasses = {}
module_name = base_class.__module__
package = importlib.import_module(module_name)
if hasattr(package, "__path__"):
for _, module_name, _ in pkgutil.walk_packages(
package.__path__, module_name + "."
):
try:
module = importlib.import_module(module_name)
for name, obj in inspect.getmembers(module, inspect.isclass):
if issubclass(obj, base_class) and obj is not base_class:
subclasses[name] = obj
except ImportError:
pass
return subclasses
def _format_type(self, t: Type) -> str:
"""Format a typing type like Optional[int] into a readable string."""
if get_origin(t): # Handle Optional and types with origins separately
return f"{t}".removeprefix("typing.")
else: # Regular types like int, str
return t.__name__
crawler = ConfigCrawler()
```
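A minimal sketch, not part of the repo, of consuming the module-level `crawler` singleton defined above; the per-attribute tuple layout mirrors what `_crawl_config` stores.

```python
# Sketch: list every discovered config attribute with its formatted type and default.
# Assumes marker and its dependencies are importable; all names come from ConfigCrawler above.
from marker.config.crawler import crawler

for base_type, classes in crawler.class_config_map.items():
    for class_name, info in classes.items():
        for attr, (attr_type, formatted, default, metadata) in info["config"].items():
            print(f"{base_type}.{class_name}.{attr}: {formatted} = {default!r}")
```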
--------------------------------------------------------------------------------
/marker/processors/line_merge.py:
--------------------------------------------------------------------------------
```python
from typing import Annotated, List
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.schema.document import Document
from marker.schema.text import Line
from marker.util import matrix_intersection_area
class LineMergeProcessor(BaseProcessor):
"""
A processor for merging inline math lines.
"""
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath, BlockTypes.Caption, BlockTypes.Footnote, BlockTypes.SectionHeader)
min_merge_pct: Annotated[
float,
"The minimum percentage of intersection area to consider merging."
] = .015
block_expand_threshold: Annotated[
float,
"The percentage of the block width to expand the bounding box."
] = .05
min_merge_ydist: Annotated[
float,
"The minimum y distance between lines to consider merging."
] = 5
intersection_pct_threshold: Annotated[
float,
"The total amount of intersection area concentrated in the max intersection block."
] = .5
vertical_overlap_pct_threshold: Annotated[
float,
"The minimum percentage of vertical overlap to consider merging."
] = .8
use_llm: Annotated[
bool,
"Whether to use LLMs to improve accuracy."
] = False
def __init__(self, config):
super().__init__(config)
def merge_lines(self, lines: List[Line], block: Block):
lines = [l for l in lines if l.polygon.width * 5 > l.polygon.height] # Skip vertical lines
line_bboxes = [l.polygon.expand(self.block_expand_threshold, 0).bbox for l in lines] # Expand horizontally
intersections = matrix_intersection_area(line_bboxes, line_bboxes)
merges = []
merge = []
for i in range(len(line_bboxes)):
intersection_row = intersections[i]
intersection_row[i] = 0 # Zero out the current idx
if i < len(line_bboxes) - 1:
intersection_row[i+1] = 0 # Zero out the next idx, so we only evaluate merge from the left
if len(merge) == 0:
merge.append(i)
continue
            # Intersection of this line with the segments already in the current merge group
merge_intersection = sum([intersection_row[m] for m in merge])
line_area = lines[i].polygon.area
intersection_pct = merge_intersection / max(1, line_area)
total_intersection = max(1, sum(intersection_row))
line_start = lines[merge[0]].polygon.y_start
line_end = lines[merge[0]].polygon.y_end
vertical_overlap_start = max(line_start, lines[i].polygon.y_start)
vertical_overlap_end = min(line_end, lines[i].polygon.y_end)
vertical_overlap = max(0, vertical_overlap_end - vertical_overlap_start)
vertical_overlap_pct = vertical_overlap / max(1, lines[i].polygon.height)
if all([
# Overlaps enough
intersection_pct >= self.min_merge_pct,
# Within same line
vertical_overlap_pct > self.vertical_overlap_pct_threshold,
# doesn't overlap with anything else
merge_intersection / total_intersection > self.intersection_pct_threshold
]):
merge.append(i)
else:
merges.append(merge)
merge = []
if merge:
merges.append(merge)
merges = [m for m in merges if len(m) > 1]
merged = set()
for merge in merges:
merge = [m for m in merge if m not in merged]
if len(merge) < 2:
continue
line: Line = lines[merge[0]]
merged.add(merge[0])
for idx in merge[1:]:
other_line: Line = lines[idx]
line.merge(other_line)
block.structure.remove(other_line.id)
other_line.removed = True # Mark line as removed
merged.add(idx)
# It is probably math if we are merging provider lines like this
if not line.formats:
line.formats = ["math"]
elif "math" not in line.formats:
line.formats.append("math")
def __call__(self, document: Document):
# Merging lines only needed for inline math
if not self.use_llm:
return
for page in document.pages:
for block in page.contained_blocks(document, self.block_types):
if block.structure is None:
continue
                if len(block.structure) < 2: # Skip single lines
continue
lines = block.contained_blocks(document, (BlockTypes.Line,))
self.merge_lines(lines, block)
```
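A hypothetical usage sketch: the processor is a no-op unless `use_llm` is enabled, and the thresholds above can be overridden through the config dict (assuming the base class accepts a plain dict, as the other configurable classes in this repo do).

```python
# Sketch only: enable the processor and tighten the horizontal-overlap threshold.
from marker.processors.line_merge import LineMergeProcessor

processor = LineMergeProcessor({"use_llm": True, "min_merge_pct": 0.02})
# processor(document)  # `document` would come from the builders; omitted in this sketch
```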
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
```python
import tempfile
from typing import Dict, Type
from PIL import Image, ImageDraw
import datasets
import pytest
from marker.builders.document import DocumentBuilder
from marker.builders.layout import LayoutBuilder
from marker.builders.line import LineBuilder
from marker.builders.ocr import OcrBuilder
from marker.builders.structure import StructureBuilder
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.providers.registry import provider_from_filepath
from marker.renderers.chunk import ChunkRenderer
from marker.renderers.html import HTMLRenderer
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.renderers.markdown import MarkdownRenderer
from marker.renderers.json import JSONRenderer
from marker.schema.registry import register_block_class
from marker.util import classes_to_strings, strings_to_classes
@pytest.fixture(scope="session")
def model_dict():
model_dict = create_model_dict()
yield model_dict
del model_dict
@pytest.fixture(scope="session")
def layout_model(model_dict):
yield model_dict["layout_model"]
@pytest.fixture(scope="session")
def detection_model(model_dict):
yield model_dict["detection_model"]
@pytest.fixture(scope="session")
def recognition_model(model_dict):
yield model_dict["recognition_model"]
@pytest.fixture(scope="session")
def table_rec_model(model_dict):
yield model_dict["table_rec_model"]
@pytest.fixture(scope="session")
def ocr_error_model(model_dict):
yield model_dict["ocr_error_model"]
@pytest.fixture(scope="function")
def config(request):
config_mark = request.node.get_closest_marker("config")
config = config_mark.args[0] if config_mark else {}
override_map: Dict[BlockTypes, Type[Block]] = config.get("override_map", {})
for block_type, override_block_type in override_map.items():
register_block_class(block_type, override_block_type)
return config
@pytest.fixture(scope="session")
def pdf_dataset():
return datasets.load_dataset("datalab-to/pdfs", split="train")
@pytest.fixture(scope="function")
def temp_doc(request, pdf_dataset):
filename_mark = request.node.get_closest_marker("filename")
filename = filename_mark.args[0] if filename_mark else "adversarial.pdf"
idx = pdf_dataset["filename"].index(filename)
suffix = filename.split(".")[-1]
temp_pdf = tempfile.NamedTemporaryFile(suffix=f".{suffix}")
temp_pdf.write(pdf_dataset["pdf"][idx])
temp_pdf.flush()
yield temp_pdf
@pytest.fixture(scope="function")
def doc_provider(request, config, temp_doc):
provider_cls = provider_from_filepath(temp_doc.name)
yield provider_cls(temp_doc.name, config)
@pytest.fixture(scope="function")
def pdf_document(
request,
config,
doc_provider,
layout_model,
ocr_error_model,
recognition_model,
detection_model,
):
layout_builder = LayoutBuilder(layout_model, config)
line_builder = LineBuilder(detection_model, ocr_error_model, config)
ocr_builder = OcrBuilder(recognition_model, config)
builder = DocumentBuilder(config)
structure_builder = StructureBuilder(config)
document = builder(doc_provider, layout_builder, line_builder, ocr_builder)
structure_builder(document)
yield document
@pytest.fixture(scope="function")
def pdf_converter(request, config, model_dict, renderer, llm_service):
if llm_service:
llm_service = classes_to_strings([llm_service])[0]
yield PdfConverter(
artifact_dict=model_dict,
processor_list=None,
renderer=classes_to_strings([renderer])[0],
config=config,
llm_service=llm_service,
)
@pytest.fixture(scope="function")
def renderer(request, config):
if request.node.get_closest_marker("output_format"):
output_format = request.node.get_closest_marker("output_format").args[0]
if output_format == "markdown":
return MarkdownRenderer
elif output_format == "json":
return JSONRenderer
elif output_format == "html":
return HTMLRenderer
elif output_format == "chunks":
return ChunkRenderer
else:
raise ValueError(f"Unknown output format: {output_format}")
else:
return MarkdownRenderer
@pytest.fixture(scope="function")
def llm_service(request, config):
llm_service = config.get("llm_service")
if not llm_service:
yield None
else:
yield strings_to_classes([llm_service])[0]
@pytest.fixture(scope="function")
def temp_image():
img = Image.new("RGB", (512, 512), color="white")
draw = ImageDraw.Draw(img)
draw.text((200, 200), "Hello, World!", fill="black", font_size=36)
with tempfile.NamedTemporaryFile(suffix=".png") as f:
img.save(f.name)
f.flush()
yield f
```
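For orientation, a hypothetical test sketch showing how the markers read by these fixtures (`filename`, `config`, `output_format`) are typically combined; the `markdown` field on the rendered output is an assumption based on the MarkdownRenderer being selected.

```python
# Hypothetical test, not part of the suite.
import pytest

@pytest.mark.filename("adversarial.pdf")
@pytest.mark.config({"page_range": [0]})
@pytest.mark.output_format("markdown")
def test_first_page_renders(pdf_converter, temp_doc):
    rendered = pdf_converter(temp_doc.name)
    assert rendered.markdown  # assumption: the markdown output model exposes `markdown`
```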
--------------------------------------------------------------------------------
/marker/extractors/document.py:
--------------------------------------------------------------------------------
```python
import json
from pydantic import BaseModel
from typing import Annotated, Optional, List
from marker.extractors import BaseExtractor
from marker.extractors.page import PageExtractionSchema
from marker.logger import get_logger
logger = get_logger()
class DocumentExtractionSchema(BaseModel):
analysis: str
document_json: str
class DocumentExtractor(BaseExtractor):
"""
An extractor that combines data from across all pages.
"""
page_schema: Annotated[
str,
"The JSON schema to be extracted from the page.",
] = ""
page_extraction_prompt = """You are an expert document analyst who reads documents and pulls data out in JSON format. You will receive your detailed notes from all the pages of a document, and a JSON schema that we want to extract from the document. Your task is to extract all the information properly into the JSON schema.
Some notes:
- The schema may contain a single object to extract from the entire document, or an array of objects.
- The schema may contain nested objects, arrays, and other complex structures.
Some guidelines:
- Some entities will span multiple pages, so make sure to consult your notes thoroughly.
- In the case of potential conflicting values, pull out the values you have the most confidence in, from your notes.
- If you cannot find a value for a field, leave it blank in the JSON.
**Instructions:**
1. Analyze your provided notes.
2. Analyze the JSON schema.
3. Write a detailed analysis of the notes, and the associated values in the schema. Make sure to reference which page each piece of information comes from.
4. Write the output in the JSON schema format, ensuring all required fields are filled out. Output only the json data, without any additional text or formatting.
**Example:**
Input:
Detailed Notes
Page 0
On this page, I see a table with car makes and sales. The makes are Honda and Toyota, with sales of 100 and 200 respectively. The color is not present in the table, so I will leave it blank in the JSON. That information may be present on another page. Some JSON snippets I may find useful later are:
```json
{
"make": "Honda",
"sales": 100,
}
```
```json
{
"make": "Toyota",
"sales": 200,
}
```
Honda is the first row in the table, and Toyota is the second row. Make is the first column, and sales is the second.
Page 1
I see a table that contains 2 rows, and has a color header. The first row has the color red, and the second row has the color blue. Here are some useful snippets:
Schema
```json
{'$defs': {'Cars': {'properties': {'make': {'title': 'Make', 'type': 'string'}, 'sales': {'title': 'Sales', 'type': 'integer'}, 'color': {'title': 'Color', 'type': 'string'}}, 'required': ['make', 'sales', 'color'], 'title': 'Cars', 'type': 'object'}}, 'properties': {'cars': {'items': {'$ref': '#/$defs/Cars'}, 'title': 'Cars', 'type': 'array'}}, 'required': ['cars'], 'title': 'CarsList', 'type': 'object'}
```
Output:
Analysis: From the notes, it looks like the information I need is in a table that spans 2 pages. The first page has the makes and sales, while the second page has the colors. I will combine this information into the JSON schema.
JSON
{
"cars": [
{
"make": "Honda",
"sales": 100,
"color": "red"
},
{
"make": "Toyota",
"sales": 200,
"color": "blue"
}
]
}
**Input:**
Detailed Notes
{{document_notes}}
Schema
```json
{{schema}}
```
"""
def assemble_document_notes(self, page_notes: List[PageExtractionSchema]) -> str:
notes = ""
for i, page_schema in enumerate(page_notes):
            if not page_schema:
continue
notes += f"Page {i + 1}\n{page_schema.detailed_notes}\n\n"
return notes.strip()
def __call__(
self,
page_notes: List[PageExtractionSchema],
**kwargs,
) -> Optional[DocumentExtractionSchema]:
if not self.page_schema:
raise ValueError(
"Page schema must be defined for structured extraction to work."
)
prompt = self.page_extraction_prompt.replace(
"{{document_notes}}", self.assemble_document_notes(page_notes)
).replace("{{schema}}", json.dumps(self.page_schema))
response = self.llm_service(prompt, None, None, DocumentExtractionSchema)
logger.debug(f"Document extraction response: {response}")
if not response or any(
[
key not in response
for key in [
"analysis",
"document_json",
]
]
):
return None
json_data = response["document_json"].strip().lstrip("```json").rstrip("```")
return DocumentExtractionSchema(
analysis=response["analysis"], document_json=json_data
)
```
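An illustrative sketch of consuming the extractor's output; `DocumentExtractionSchema` is the pydantic model defined above, and the car data echoes the prompt's own example.

```python
import json

from marker.extractors.document import DocumentExtractionSchema

result = DocumentExtractionSchema(
    analysis="Combined make/sales from page 0 with colors from page 1.",
    document_json='{"cars": [{"make": "Honda", "sales": 100, "color": "red"}]}',
)
cars = json.loads(result.document_json)["cars"]
print(cars[0]["make"])  # Honda
```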
--------------------------------------------------------------------------------
/marker/scripts/server.py:
--------------------------------------------------------------------------------
```python
import traceback
import click
import os
from pydantic import BaseModel, Field
from starlette.responses import HTMLResponse
from marker.config.parser import ConfigParser
from marker.output import text_from_rendered
import base64
from contextlib import asynccontextmanager
from typing import Optional, Annotated
import io
from fastapi import FastAPI, Form, File, UploadFile
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.settings import settings
app_data = {}
UPLOAD_DIRECTORY = "./uploads"
os.makedirs(UPLOAD_DIRECTORY, exist_ok=True)
@asynccontextmanager
async def lifespan(app: FastAPI):
app_data["models"] = create_model_dict()
yield
if "models" in app_data:
del app_data["models"]
app = FastAPI(lifespan=lifespan)
@app.get("/")
async def root():
return HTMLResponse(
"""
<h1>Marker API</h1>
<ul>
<li><a href="/docs">API Documentation</a></li>
<li><a href="/marker">Run marker (post request only)</a></li>
</ul>
"""
)
class CommonParams(BaseModel):
filepath: Annotated[
Optional[str], Field(description="The path to the PDF file to convert.")
]
page_range: Annotated[
Optional[str],
Field(
description="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20",
example=None,
),
] = None
force_ocr: Annotated[
bool,
Field(
description="Force OCR on all pages of the PDF. Defaults to False. This can lead to worse results if you have good text in your PDFs (which is true in most cases)."
),
] = False
paginate_output: Annotated[
bool,
Field(
description="Whether to paginate the output. Defaults to False. If set to True, each page of the output will be separated by a horizontal rule that contains the page number (2 newlines, {PAGE_NUMBER}, 48 - characters, 2 newlines)."
),
] = False
output_format: Annotated[
str,
Field(
description="The format to output the text in. Can be 'markdown', 'json', or 'html'. Defaults to 'markdown'."
),
] = "markdown"
async def _convert_pdf(params: CommonParams):
assert params.output_format in ["markdown", "json", "html", "chunks"], (
"Invalid output format"
)
try:
options = params.model_dump()
config_parser = ConfigParser(options)
config_dict = config_parser.generate_config_dict()
config_dict["pdftext_workers"] = 1
converter_cls = PdfConverter
converter = converter_cls(
config=config_dict,
artifact_dict=app_data["models"],
processor_list=config_parser.get_processors(),
renderer=config_parser.get_renderer(),
llm_service=config_parser.get_llm_service(),
)
rendered = converter(params.filepath)
text, _, images = text_from_rendered(rendered)
metadata = rendered.metadata
except Exception as e:
traceback.print_exc()
return {
"success": False,
"error": str(e),
}
encoded = {}
for k, v in images.items():
byte_stream = io.BytesIO()
v.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(
settings.OUTPUT_ENCODING
)
return {
"format": params.output_format,
"output": text,
"images": encoded,
"metadata": metadata,
"success": True,
}
@app.post("/marker")
async def convert_pdf(params: CommonParams):
return await _convert_pdf(params)
@app.post("/marker/upload")
async def convert_pdf_upload(
page_range: Optional[str] = Form(default=None),
force_ocr: Optional[bool] = Form(default=False),
paginate_output: Optional[bool] = Form(default=False),
output_format: Optional[str] = Form(default="markdown"),
file: UploadFile = File(
..., description="The PDF file to convert.", media_type="application/pdf"
),
):
upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
with open(upload_path, "wb+") as upload_file:
file_contents = await file.read()
upload_file.write(file_contents)
params = CommonParams(
filepath=upload_path,
page_range=page_range,
force_ocr=force_ocr,
paginate_output=paginate_output,
output_format=output_format,
)
results = await _convert_pdf(params)
os.remove(upload_path)
return results
@click.command()
@click.option("--port", type=int, default=8000, help="Port to run the server on")
@click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
def server_cli(port: int, host: str):
import uvicorn
# Run the server
uvicorn.run(
app,
host=host,
port=port,
)
```
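A hypothetical client sketch for the endpoints above, assuming the server is running on 127.0.0.1:8000 (the defaults in `server_cli`) and can read the file path you pass.

```python
# Sketch: call the JSON endpoint and inspect the response shape produced by _convert_pdf.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/marker",
    json={"filepath": "/path/to/document.pdf", "output_format": "markdown"},
)
data = resp.json()
if data["success"]:
    print(data["output"][:200])
else:
    print("conversion failed:", data["error"])
```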
--------------------------------------------------------------------------------
/marker/processors/equation.py:
--------------------------------------------------------------------------------
```python
from typing import Annotated, List, Tuple
from PIL import Image
import re
from bs4 import BeautifulSoup
from ftfy import fix_text, TextFixerConfig
from surya.recognition import RecognitionPredictor, OCRResult
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.settings import settings
MATH_TAG_PATTERN = re.compile(r"<math[^>]*>(.*?)</math>")
class EquationProcessor(BaseProcessor):
"""
A processor for recognizing equations in the document.
"""
block_types: Annotated[
Tuple[BlockTypes],
"The block types to process.",
] = (BlockTypes.Equation,)
model_max_length: Annotated[
int,
"The maximum number of tokens to allow for the Recognition model.",
] = 1024
equation_batch_size: Annotated[
int,
"The batch size to use for the recognition model while processing equations.",
"Default is None, which will use the default batch size for the model.",
] = None
disable_tqdm: Annotated[
bool,
"Whether to disable the tqdm progress bar.",
] = False
drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False
def __init__(self, recognition_model: RecognitionPredictor, config=None):
super().__init__(config)
self.recognition_model = recognition_model
def get_batch_size(self):
# Set to 1/4th of OCR batch size due to sequence length with tiling
if self.equation_batch_size is not None:
return self.equation_batch_size
elif settings.TORCH_DEVICE_MODEL == "cuda":
return 32
elif settings.TORCH_DEVICE_MODEL == "mps":
return 6
return 6
def __call__(self, document: Document):
images = []
equation_boxes = []
equation_block_ids = []
total_equation_blocks = 0
for page in document.pages:
page_image = page.get_image(highres=True)
page_size = page.polygon.width, page.polygon.height
image_size = page_image.size
page_equation_boxes = []
page_equation_block_ids = []
equation_blocks = page.contained_blocks(document, self.block_types)
for block in equation_blocks:
page_equation_boxes.append(
block.polygon.rescale(page_size, image_size).bbox
)
page_equation_block_ids.append(block.id)
total_equation_blocks += 1
images.append(page_image)
equation_boxes.append(page_equation_boxes)
equation_block_ids.append(page_equation_block_ids)
if total_equation_blocks == 0:
return
predictions = self.get_latex_batched(images, equation_boxes)
for page_predictions, page_equation_block_ids in zip(
predictions, equation_block_ids
):
assert len(page_predictions) == len(page_equation_block_ids), (
"Every equation block should have a corresponding prediction"
)
for block_prediction, block_id in zip(
page_predictions, page_equation_block_ids
):
block = document.get_block(block_id)
block.html = self.fix_latex(block_prediction)
def fix_latex(self, math_html: str):
math_html = math_html.strip()
soup = BeautifulSoup(math_html, "html.parser")
opening_math_tag = soup.find("math")
# No math block found
if not opening_math_tag:
return ""
# Force block format
opening_math_tag.attrs["display"] = "block"
fixed_math_html = str(soup)
# Sometimes model outputs newlines at the beginning/end of tags
fixed_math_html = re.sub(
r"^<math display=\"block\">\\n(?![a-zA-Z])",
'<math display="block">',
fixed_math_html,
)
fixed_math_html = re.sub(r"\\n</math>$", "</math>", fixed_math_html)
fixed_math_html = re.sub(r"<br>", "", fixed_math_html)
fixed_math_html = fix_text(
fixed_math_html, config=TextFixerConfig(unescape_html=True)
)
return fixed_math_html
def get_latex_batched(
self,
page_images: List[Image.Image],
bboxes: List[List[List[float]]],
):
self.recognition_model.disable_tqdm = self.disable_tqdm
predictions: List[OCRResult] = self.recognition_model(
images=page_images,
bboxes=bboxes,
task_names=["ocr_with_boxes"] * len(page_images),
recognition_batch_size=self.get_batch_size(),
sort_lines=False,
drop_repeated_text=self.drop_repeated_text,
max_tokens=2048,
max_sliding_window=2148,
)
equation_predictions = [
[line.text.strip() for line in page_prediction.text_lines]
for page_prediction in predictions
]
return equation_predictions
```
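A small sketch exercising `fix_latex` in isolation; the recognition model is not needed for it, so `None` is passed to the constructor purely for illustration (the module still imports surya at the top level, so that dependency must be installed).

```python
from marker.processors.equation import EquationProcessor

proc = EquationProcessor(None)  # recognition model unused by fix_latex
print(proc.fix_latex("<math>x^{2} + y^{2} = z^{2}</math>"))
# expected: '<math display="block">x^{2} + y^{2} = z^{2}</math>'
```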
--------------------------------------------------------------------------------
/marker/processors/llm/llm_equation.py:
--------------------------------------------------------------------------------
```python
from pydantic import BaseModel
from marker.processors.llm import BaseLLMSimpleBlockProcessor, PromptData, BlockData
from marker.schema import BlockTypes
from marker.schema.document import Document
from typing import Annotated, List
class LLMEquationProcessor(BaseLLMSimpleBlockProcessor):
block_types = (BlockTypes.Equation,)
min_equation_height: Annotated[
float,
"The minimum ratio between equation height and page height to consider for processing.",
] = 0.06
image_expansion_ratio: Annotated[
float,
"The ratio to expand the image by when cropping.",
] = 0.05 # Equations sometimes get bboxes that are too tight
redo_inline_math: Annotated[
bool,
"Whether to redo inline math blocks.",
] = False
equation_latex_prompt: Annotated[
str,
"The prompt to use for generating LaTeX from equations.",
"Default is a string containing the Gemini prompt."
] = r"""You're an expert mathematician who is good at writing LaTeX code and html for equations.
You'll receive an image of a math block, along with the text extracted from the block. It may contain one or more equations. Your job is to write html that represents the content of the image, with the equations in LaTeX format.
Some guidelines:
- Output valid html, where all the equations can render properly.
- Use <math display="block"> as a block equation delimiter and <math> for inline equations. Do not use $ or $$ as delimiters.
- Keep the LaTeX code inside the math tags simple, concise, and KaTeX compatible.
- Enclose all equations in the correct math tags. Use multiple math tags inside the html to represent multiple equations.
- Only use the html tags math, i, b, p, and br.
- Make sure to include all the equations in the image in the html output.
- Make sure to include other text in the image in the correct positions along with the equations.
**Instructions:**
1. Carefully examine the provided image.
2. Analyze the existing html, which may include LaTeX code.
3. Write a short analysis of how the html should be corrected to represent the image.
4. If the html and LaTeX are correct, write "No corrections needed."
5. If the html and LaTeX are incorrect, generate the corrected html.
6. Output only the analysis, then the corrected html or "No corrections needed."
**Example:**
Input:
```html
The following equation illustrates the Pythagorean theorem:
x2 + y2 = z2
And this equation is a bit more complex:
(ab * x5 + x2 + 2 * x + 123)/t
```
Output:
analysis: The equations are not formatted as LaTeX, or enclosed in math tags.
```html
<p>The following equation illustrates the Pythagorean theorem:</p>
<math display="block">x^{2} + y^{2} = z^{2}</math>
<p>And this equation is a bit more complex, and contains <math>ab \cdot x^{5}</math>:</p>
<math display="block">\frac{ab \cdot x^{5} + x^{2} + 2 \cdot x + 123}{t}</math>
```
**Input:**
```html
{equation}
```
"""
def inference_blocks(self, document: Document) -> List[BlockData]:
blocks = super().inference_blocks(document)
out_blocks = []
for block_data in blocks:
block = block_data["block"]
page = block_data["page"]
# If we redo inline math, we redo all equations
if all([
block.polygon.height / page.polygon.height < self.min_equation_height,
not self.redo_inline_math
]):
continue
out_blocks.append(block_data)
return out_blocks
def block_prompts(self, document: Document) -> List[PromptData]:
prompt_data = []
for block_data in self.inference_blocks(document):
block = block_data["block"]
text = block.html if block.html else block.raw_text(document)
prompt = self.equation_latex_prompt.replace("{equation}", text)
image = self.extract_image(document, block)
prompt_data.append({
"prompt": prompt,
"image": image,
"block": block,
"schema": EquationSchema,
"page": block_data["page"]
})
return prompt_data
def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
block = prompt_data["block"]
text = block.html if block.html else block.raw_text(document)
if not response or "corrected_equation" not in response:
block.update_metadata(llm_error_count=1)
return
html_equation = response["corrected_equation"]
if "no corrections needed" in html_equation.lower():
return
balanced_tags = html_equation.count("<math") == html_equation.count("</math>")
if not all([
html_equation,
balanced_tags,
len(html_equation) > len(text) * .3,
]):
block.update_metadata(llm_error_count=1)
return
block.html = html_equation
class EquationSchema(BaseModel):
analysis: str
corrected_equation: str
```
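For reference, an illustrative construction of the `EquationSchema` the LLM response is validated against; the values are invented, and the balanced-tag check mirrors the guard in `rewrite_block`.

```python
from marker.processors.llm.llm_equation import EquationSchema

resp = EquationSchema(
    analysis="The equations were plain text, not wrapped in math tags.",
    corrected_equation='<math display="block">x^{2} + y^{2} = z^{2}</math>',
)
print(resp.corrected_equation.count("<math") == resp.corrected_equation.count("</math>"))  # True
```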
--------------------------------------------------------------------------------
/marker/builders/structure.py:
--------------------------------------------------------------------------------
```python
from typing import Annotated
from marker.builders import BaseBuilder
from marker.schema import BlockTypes
from marker.schema.blocks import Text
from marker.schema.document import Document
from marker.schema.groups import ListGroup
from marker.schema.groups.page import PageGroup
from marker.schema.registry import get_block_class
class StructureBuilder(BaseBuilder):
"""
A builder for grouping blocks together based on their structure.
"""
gap_threshold: Annotated[
float,
"The minimum gap between blocks to consider them part of the same group.",
] = 0.05
list_gap_threshold: Annotated[
float,
"The minimum gap between list items to consider them part of the same group.",
] = 0.1
def __init__(self, config=None):
super().__init__(config)
def __call__(self, document: Document):
for page in document.pages:
self.group_caption_blocks(page)
self.group_lists(page)
self.unmark_lists(page)
def group_caption_blocks(self, page: PageGroup):
gap_threshold_px = self.gap_threshold * page.polygon.height
static_page_structure = page.structure.copy()
remove_ids = list()
for i, block_id in enumerate(static_page_structure):
block = page.get_block(block_id)
if block.block_type not in [BlockTypes.Table, BlockTypes.Figure, BlockTypes.Picture]:
continue
if block.id in remove_ids:
continue
block_structure = [block_id]
selected_polygons = [block.polygon]
caption_types = [BlockTypes.Caption, BlockTypes.Footnote]
prev_block = page.get_prev_block(block)
next_block = page.get_next_block(block)
if prev_block and \
prev_block.block_type in caption_types and \
prev_block.polygon.minimum_gap(block.polygon) < gap_threshold_px and \
prev_block.id not in remove_ids:
block_structure.insert(0, prev_block.id)
selected_polygons.append(prev_block.polygon)
if next_block and \
next_block.block_type in caption_types and \
next_block.polygon.minimum_gap(block.polygon) < gap_threshold_px:
block_structure.append(next_block.id)
selected_polygons.append(next_block.polygon)
if len(block_structure) > 1:
# Create a merged block
new_block_cls = get_block_class(BlockTypes[block.block_type.name + "Group"])
new_polygon = block.polygon.merge(selected_polygons)
group_block = page.add_block(new_block_cls, new_polygon)
group_block.structure = block_structure
# Update the structure of the page to reflect the new block
page.update_structure_item(block_id, group_block.id)
remove_ids.extend(block_structure)
page.remove_structure_items(remove_ids)
def group_lists(self, page: PageGroup):
gap_threshold_px = self.list_gap_threshold * page.polygon.height
static_page_structure = page.structure.copy()
remove_ids = list()
for i, block_id in enumerate(static_page_structure):
block = page.get_block(block_id)
if block.block_type not in [BlockTypes.ListItem]:
continue
if block.id in remove_ids:
continue
block_structure = [block_id]
selected_polygons = [block.polygon]
for j, next_block_id in enumerate(page.structure[i + 1:]):
next_block = page.get_block(next_block_id)
if all([
next_block.block_type == BlockTypes.ListItem,
next_block.polygon.minimum_gap(selected_polygons[-1]) < gap_threshold_px
]):
block_structure.append(next_block_id)
selected_polygons.append(next_block.polygon)
else:
break
if len(block_structure) > 1:
new_polygon = block.polygon.merge(selected_polygons)
group_block = page.add_block(ListGroup, new_polygon)
group_block.structure = block_structure
# Update the structure of the page to reflect the new block
page.update_structure_item(block_id, group_block.id)
remove_ids.extend(block_structure)
page.remove_structure_items(remove_ids)
def unmark_lists(self, page: PageGroup):
# If lists aren't grouped, unmark them as list items
for block_id in page.structure:
block = page.get_block(block_id)
if block.block_type == BlockTypes.ListItem:
generated_block = Text(
polygon=block.polygon,
page_id=block.page_id,
structure=block.structure,
)
page.replace_block(block, generated_block)
```
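A hypothetical wiring sketch: as in `tests/conftest.py` above, `StructureBuilder` runs after the document, layout, line, and OCR builders and mutates the document in place.

```python
from marker.builders.structure import StructureBuilder

structure_builder = StructureBuilder({"gap_threshold": 0.05, "list_gap_threshold": 0.1})
# structure_builder(document)  # groups captions with figures/tables and wraps runs of list items in ListGroups
```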
--------------------------------------------------------------------------------
/marker/services/gemini.py:
--------------------------------------------------------------------------------
```python
import json
import time
import traceback
from io import BytesIO
from typing import List, Annotated
import PIL
from google import genai
from google.genai import types
from google.genai.errors import APIError
from marker.logger import get_logger
from pydantic import BaseModel
from marker.schema.blocks import Block
from marker.services import BaseService
logger = get_logger()
class BaseGeminiService(BaseService):
gemini_model_name: Annotated[
str, "The name of the Google model to use for the service."
] = "gemini-2.0-flash"
thinking_budget: Annotated[
int, "The thinking token budget to use for the service."
] = None
def img_to_bytes(self, img: PIL.Image.Image):
image_bytes = BytesIO()
img.save(image_bytes, format="WEBP")
return image_bytes.getvalue()
def get_google_client(self, timeout: int):
raise NotImplementedError
def process_images(self, images):
image_parts = [
types.Part.from_bytes(data=self.img_to_bytes(img), mime_type="image/webp")
for img in images
]
return image_parts
def __call__(
self,
prompt: str,
image: PIL.Image.Image | List[PIL.Image.Image] | None,
block: Block | None,
response_schema: type[BaseModel],
max_retries: int | None = None,
timeout: int | None = None,
):
if max_retries is None:
max_retries = self.max_retries
if timeout is None:
timeout = self.timeout
client = self.get_google_client(timeout=timeout)
image_parts = self.format_image_for_llm(image)
total_tries = max_retries + 1
temperature = 0
for tries in range(1, total_tries + 1):
config = {
"temperature": temperature,
"response_schema": response_schema,
"response_mime_type": "application/json",
}
if self.max_output_tokens:
config["max_output_tokens"] = self.max_output_tokens
if self.thinking_budget is not None:
# For gemini models, we can optionally set a thinking budget in the config
config["thinking_config"] = types.ThinkingConfig(
thinking_budget=self.thinking_budget
)
try:
responses = client.models.generate_content(
model=self.gemini_model_name,
contents=image_parts
+ [
prompt
], # According to gemini docs, it performs better if the image is the first element
config=config,
)
output = responses.candidates[0].content.parts[0].text
total_tokens = responses.usage_metadata.total_token_count
if block:
block.update_metadata(
llm_tokens_used=total_tokens, llm_request_count=1
)
return json.loads(output)
except APIError as e:
if e.code in [429, 443, 503]:
# Rate limit exceeded
if tries == total_tries:
# Last attempt failed. Give up
logger.error(
f"APIError: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
)
break
else:
wait_time = tries * self.retry_wait_time
logger.warning(
f"APIError: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})",
)
time.sleep(wait_time)
else:
logger.error(f"APIError: {e}")
break
except json.JSONDecodeError as e:
                temperature = 0.2 # Increase temperature slightly to try and get a different response
# The response was not valid JSON
if tries == total_tries:
# Last attempt failed. Give up
logger.error(
f"JSONDecodeError: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
)
break
else:
logger.warning(
f"JSONDecodeError: {e}. Retrying... (Attempt {tries}/{total_tries})",
)
except Exception as e:
logger.error(f"Exception: {e}")
traceback.print_exc()
break
return {}
class GoogleGeminiService(BaseGeminiService):
gemini_api_key: Annotated[str, "The Google API key to use for the service."] = None
def get_google_client(self, timeout: int):
return genai.Client(
api_key=self.gemini_api_key,
http_options={"timeout": timeout * 1000}, # Convert to milliseconds
)
```
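A hypothetical usage sketch; the `Answer` schema is invented, the API key is a placeholder, and the dict-style constructor is an assumption based on how the other configurable classes in this repo are built.

```python
from pydantic import BaseModel

from marker.services.gemini import GoogleGeminiService

class Answer(BaseModel):  # invented response schema for the sketch
    text: str

service = GoogleGeminiService({"gemini_api_key": "YOUR_KEY", "thinking_budget": 0})
# result = service("Summarize this page.", None, None, Answer)
# `result` would be a dict parsed from the model's JSON output, or {} once retries are exhausted.
```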
--------------------------------------------------------------------------------
/marker/processors/line_numbers.py:
--------------------------------------------------------------------------------
```python
from typing import Annotated
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
class LineNumbersProcessor(BaseProcessor):
"""
A processor for ignoring line numbers.
"""
block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
strip_numbers_threshold: Annotated[
float,
"The fraction of lines or tokens in a block that must be numeric to consider them as line numbers.",
] = 0.6
min_lines_in_block: Annotated[
int,
"The minimum number of lines required in a block for it to be considered during processing.",
"Ensures that small blocks are ignored as they are unlikely to contain meaningful line numbers.",
] = 4
min_line_length: Annotated[
int,
"The minimum length of a line (in characters) to consider it significant when checking for",
"numeric prefixes or suffixes. Prevents false positives for short lines.",
] = 10
min_line_number_span_ratio: Annotated[
float,
"The minimum ratio of detected line number spans to total lines required to treat them as line numbers.",
] = .6
def __init__(self, config):
super().__init__(config)
def __call__(self, document: Document):
self.ignore_line_number_spans(document)
self.ignore_line_starts_ends(document)
self.ignore_line_number_blocks(document)
def ignore_line_number_spans(self, document: Document):
for page in document.pages:
line_count = 0
line_number_spans = []
for block in page.contained_blocks(document, (BlockTypes.Line,)):
if block.structure is None:
continue
line_count += 1
leftmost_span = None
for span in block.contained_blocks(document, (BlockTypes.Span,)):
if leftmost_span is None or span.polygon.x_start < leftmost_span.polygon.x_start:
leftmost_span = span
if leftmost_span is not None and leftmost_span.text.strip().isnumeric():
line_number_spans.append(leftmost_span)
if line_count > 0 and len(line_number_spans) / line_count > self.min_line_number_span_ratio:
for span in line_number_spans:
span.ignore_for_output = True
def ignore_line_number_blocks(self, document: Document):
for page in document.pages:
for block in page.contained_blocks(document, self.block_types):
raw_text = block.raw_text(document)
tokens = raw_text.strip().split()
if len(tokens) < 4:
continue
tokens_are_numbers = [token.isdigit() for token in tokens]
if all([
sum(tokens_are_numbers) / len(tokens) > self.strip_numbers_threshold,
block.polygon.height > block.polygon.width # Ensure block is taller than it is wide, like vertical page numbers
]):
block.ignore_for_output = True
def ignore_line_starts_ends(self, document: Document):
for page in document.pages:
for block in page.contained_blocks(document, self.block_types):
if block.structure is None:
continue
all_lines = block.structure_blocks(document)
if len(all_lines) < self.min_lines_in_block:
continue
starts_with_number = []
ends_with_number = []
for line in all_lines:
spans = line.structure_blocks(document)
if len(spans) < 2:
starts_with_number.append(False)
ends_with_number.append(False)
continue
raw_text = line.raw_text(document)
starts = all([
spans[0].text.strip().isdigit(),
len(raw_text) - len(spans[0].text.strip()) > self.min_line_length
])
ends = all([
spans[-1].text.strip().isdigit(),
len(raw_text) - len(spans[-1].text.strip()) > self.min_line_length
])
starts_with_number.append(starts)
ends_with_number.append(ends)
if sum(starts_with_number) / len(starts_with_number) > self.strip_numbers_threshold:
for starts, line in zip(starts_with_number, all_lines):
if starts:
span = page.get_block(line.structure[0])
span.ignore_for_output = True
if sum(ends_with_number) / len(ends_with_number) > self.strip_numbers_threshold:
for ends, line in zip(ends_with_number, all_lines):
if ends:
span = page.get_block(line.structure[-1])
span.ignore_for_output = True
```
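A worked example of the `strip_numbers_threshold` heuristic in `ignore_line_number_blocks`: a block whose tokens are mostly digits (and which is taller than it is wide) gets dropped from the output.

```python
tokens = "12 13 14 15 Introduction".split()
numeric_ratio = sum(t.isdigit() for t in tokens) / len(tokens)
print(numeric_ratio)        # 0.8
print(numeric_ratio > 0.6)  # True -> the block would get ignore_for_output = True
```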
--------------------------------------------------------------------------------
/marker/scripts/streamlit_app.py:
--------------------------------------------------------------------------------
```python
import os
from marker.scripts.common import (
load_models,
parse_args,
img_to_html,
get_page_image,
page_count,
)
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["IN_STREAMLIT"] = "true"
from marker.settings import settings
from streamlit.runtime.uploaded_file_manager import UploadedFile
import re
import tempfile
from typing import Any, Dict
import streamlit as st
from PIL import Image
from marker.converters.pdf import PdfConverter
from marker.config.parser import ConfigParser
from marker.output import text_from_rendered
def convert_pdf(fname: str, config_parser: ConfigParser) -> (str, Dict[str, Any], dict):
config_dict = config_parser.generate_config_dict()
config_dict["pdftext_workers"] = 1
converter_cls = PdfConverter
converter = converter_cls(
config=config_dict,
artifact_dict=model_dict,
processor_list=config_parser.get_processors(),
renderer=config_parser.get_renderer(),
llm_service=config_parser.get_llm_service(),
)
return converter(fname)
def markdown_insert_images(markdown, images):
image_tags = re.findall(
r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))',
markdown,
)
for image in image_tags:
image_markdown = image[0]
image_alt = image[1]
image_path = image[2]
if image_path in images:
markdown = markdown.replace(
image_markdown, img_to_html(images[image_path], image_alt)
)
return markdown
st.set_page_config(layout="wide")
col1, col2 = st.columns([0.5, 0.5])
model_dict = load_models()
cli_options = parse_args()
st.markdown("""
# Marker Demo
This app will let you try marker, a PDF or image -> Markdown, HTML, JSON converter. It works with any language, and extracts images, tables, equations, etc.
Find the project [here](https://github.com/VikParuchuri/marker).
""")
in_file: UploadedFile = st.sidebar.file_uploader(
"PDF, document, or image file:",
type=["pdf", "png", "jpg", "jpeg", "gif", "pptx", "docx", "xlsx", "html", "epub"],
)
if in_file is None:
st.stop()
filetype = in_file.type
with col1:
page_count = page_count(in_file)
page_number = st.number_input(
f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count
)
pil_image = get_page_image(in_file, page_number)
st.image(pil_image, use_container_width=True)
page_range = st.sidebar.text_input(
"Page range to parse, comma separated like 0,5-10,20",
value=f"{page_number}-{page_number}",
)
output_format = st.sidebar.selectbox(
"Output format", ["markdown", "json", "html", "chunks"], index=0
)
run_marker = st.sidebar.button("Run Marker")
use_llm = st.sidebar.checkbox(
"Use LLM", help="Use LLM for higher quality processing", value=False
)
force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
strip_existing_ocr = st.sidebar.checkbox(
"Strip existing OCR",
help="Strip existing OCR text from the PDF and re-OCR.",
value=False,
)
debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
disable_ocr_math = st.sidebar.checkbox(
"Disable math",
help="Disable math in OCR output - no inline math",
value=False,
)
if not run_marker:
st.stop()
# Run Marker
with tempfile.TemporaryDirectory() as tmp_dir:
temp_pdf = os.path.join(tmp_dir, "temp.pdf")
with open(temp_pdf, "wb") as f:
f.write(in_file.getvalue())
cli_options.update(
{
"output_format": output_format,
"page_range": page_range,
"force_ocr": force_ocr,
"debug": debug,
"output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
"use_llm": use_llm,
"strip_existing_ocr": strip_existing_ocr,
"disable_ocr_math": disable_ocr_math,
}
)
config_parser = ConfigParser(cli_options)
rendered = convert_pdf(temp_pdf, config_parser)
page_range = config_parser.generate_config_dict()["page_range"]
first_page = page_range[0] if page_range else 0
text, ext, images = text_from_rendered(rendered)
with col2:
if output_format == "markdown":
text = markdown_insert_images(text, images)
st.markdown(text, unsafe_allow_html=True)
elif output_format == "json":
st.json(text)
elif output_format == "html":
st.html(text)
elif output_format == "chunks":
st.json(text)
if debug:
with col1:
debug_data_path = rendered.metadata.get("debug_data_path")
if debug_data_path:
pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png")
img = Image.open(pdf_image_path)
st.image(img, caption="PDF debug image", use_container_width=True)
layout_image_path = os.path.join(
debug_data_path, f"layout_page_{first_page}.png"
)
img = Image.open(layout_image_path)
st.image(img, caption="Layout debug image", use_container_width=True)
st.write("Raw output:")
st.code(text, language=output_format)
```
--------------------------------------------------------------------------------
/benchmarks/throughput/main.py:
--------------------------------------------------------------------------------
```python
import os
import tempfile
import time
from multiprocessing import get_context
from concurrent.futures import ProcessPoolExecutor
import torch
import click
import pypdfium2 as pdfium
from tqdm import tqdm
import datasets
def get_next_pdf(ds: datasets.Dataset, i: int):
while True:
pdf = ds[i]["pdf"]
filename = ds[i]["filename"]
if pdf and filename.endswith(".pdf"):
return pdf, filename, i + 1
i += 1
if i >= len(ds):
i = 0
def single_batch(
batch_size: int,
num_threads: int,
force_ocr: bool,
quantize: bool,
compile: bool,
worker_id: int,
chunksize: int = 100,
):
if quantize:
os.environ["RECOGNITION_MODEL_QUANTIZE"] = "true"
if compile:
os.environ["COMPILE_ALL"] = "true"
for item in [
"DETECTOR_POSTPROCESSING_CPU_WORKERS",
"OPENBLAS_NUM_THREADS",
"PDFTEXT_CPU_WORKERS",
"OMP_NUM_THREADS",
]:
os.environ[item] = f"{num_threads}"
torch.set_num_threads(num_threads)
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
ds = datasets.load_dataset("datalab-to/pdfs", split="train")
model_dict = create_model_dict()
torch.cuda.reset_peak_memory_stats()
times = []
i = 0
pages = 0
chars = 0
min_time = time.time()
for _ in range(batch_size):
pdf, fname, i = get_next_pdf(ds, i)
print(f"Inferencing {fname} on worker {worker_id}...")
pdf_doc = pdfium.PdfDocument(pdf)
page_count = len(pdf_doc)
pdf_doc.close()
pages += page_count
with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
f.write(pdf)
f.flush()
page_range_chunks = list(range(0, page_count, chunksize))
for chunk_start in page_range_chunks:
chunk_end = min(chunk_start + chunksize, page_count)
page_range = list(range(chunk_start, chunk_end))
block_converter = PdfConverter(
artifact_dict=model_dict,
config={
"disable_tqdm": worker_id > 0,
"page_range": page_range,
"force_ocr": force_ocr,
},
)
start = time.time()
rendered = block_converter(f.name)
markdown, _, _ = text_from_rendered(rendered)
chars += len(markdown)
total = time.time() - start
times.append(total)
max_gpu_vram = torch.cuda.max_memory_reserved() / 1024**3
max_time = time.time()
return sum(times), min_time, max_time, max_gpu_vram, pages, chars
@click.command(help="Benchmark PDF to MD conversion throughput.")
@click.option("--workers", default=1, help="Number of workers to use.")
@click.option("--batch_size", default=1, help="Batch size for inference.")
@click.option("--force_ocr", is_flag=True, help="Force OCR on all pages.")
@click.option("--quantize", is_flag=True, help="Use quantized model.")
@click.option("--compile", is_flag=True, help="Use compiled model.")
def main(
workers: int,
batch_size: int,
force_ocr: bool,
quantize: bool,
compile: bool,
):
total_cpus = os.cpu_count()
start = time.time()
current_gpu_vram = torch.cuda.memory_reserved() / 1024**3
with ProcessPoolExecutor(
max_workers=workers, mp_context=get_context("spawn")
) as executor:
cpus_per_worker = min(8, max(2, total_cpus // workers))
futures = [
executor.submit(
single_batch,
batch_size,
cpus_per_worker,
force_ocr,
quantize,
compile,
i,
)
for i in range(workers)
]
all_times = []
min_time = None
max_time = time.time()
vrams = []
page_count = 0
char_count = 0
for future in tqdm(futures, desc="Running marker workers..."):
times, min_time_worker, max_time_worker, max_vram, pages, chars = (
future.result()
)
vrams.append(max_vram - current_gpu_vram)
all_times.append(times)
page_count += pages
char_count += chars
min_time = (
min(min_time_worker, min_time)
if min_time is not None
else min_time_worker
)
max_time = max(max_time, max_time_worker)
end = time.time() - start
all_worker_time = max_time - min_time
print(f"Average time per worker: {sum(all_times) / len(all_times)}")
print(f"Max time per worker: {max(all_times)}")
print(f"End to end time (counting model loading), all processes: {end}")
print(f"End to end time (no model loading), all processes: {all_worker_time}")
print(f"Total pages: {page_count}")
print(f"Total characters: {char_count}")
print(f"Time per page: {all_worker_time / page_count:.2f}")
print(f"Characters per second: {char_count / all_worker_time:.2f}")
print(f"Max GPU VRAM: {max(vrams):.2f} GB")
print(f"Average GPU VRAM: {sum(vrams) / len(vrams):.2f} GB")
if __name__ == "__main__":
main()
```
--------------------------------------------------------------------------------
/marker/renderers/__init__.py:
--------------------------------------------------------------------------------
```python
import base64
import io
import re
from collections import Counter
from typing import Annotated, Optional, Tuple, Literal
from bs4 import BeautifulSoup
from pydantic import BaseModel
from marker.schema import BlockTypes
from marker.schema.blocks.base import BlockId, BlockOutput
from marker.schema.document import Document
from marker.settings import settings
from marker.util import assign_config
class BaseRenderer:
image_blocks: Annotated[
Tuple[BlockTypes, ...], "The block types to consider as images."
] = (BlockTypes.Picture, BlockTypes.Figure)
extract_images: Annotated[bool, "Extract images from the document."] = True
image_extraction_mode: Annotated[
Literal["lowres", "highres"],
"The mode to use for extracting images.",
] = "highres"
keep_pageheader_in_output: Annotated[
bool, "Keep the page header in the output HTML."
] = False
keep_pagefooter_in_output: Annotated[
bool, "Keep the page footer in the output HTML."
] = False
add_block_ids: Annotated[bool, "Whether to add block IDs to the output HTML."] = (
False
)
def __init__(self, config: Optional[BaseModel | dict] = None):
assign_config(self, config)
self.block_config = {
"keep_pageheader_in_output": self.keep_pageheader_in_output,
"keep_pagefooter_in_output": self.keep_pagefooter_in_output,
"add_block_ids": self.add_block_ids,
}
def __call__(self, document):
# Children are in reading order
raise NotImplementedError
def extract_image(self, document: Document, image_id, to_base64=False):
image_block = document.get_block(image_id)
cropped = image_block.get_image(
document, highres=self.image_extraction_mode == "highres"
)
if to_base64:
image_buffer = io.BytesIO()
# RGBA to RGB
if not cropped.mode == "RGB":
cropped = cropped.convert("RGB")
cropped.save(image_buffer, format=settings.OUTPUT_IMAGE_FORMAT)
cropped = base64.b64encode(image_buffer.getvalue()).decode(
settings.OUTPUT_ENCODING
)
return cropped
@staticmethod
def merge_consecutive_math(html, tag="math"):
if not html:
return html
pattern = rf"-</{tag}>(\s*)<{tag}>"
html = re.sub(pattern, " ", html)
pattern = rf'-</{tag}>(\s*)<{tag} display="inline">'
html = re.sub(pattern, " ", html)
return html
@staticmethod
def merge_consecutive_tags(html, tag):
if not html:
return html
def replace_whitespace(match):
whitespace = match.group(1)
if len(whitespace) == 0:
return ""
else:
return " "
pattern = rf"</{tag}>(\s*)<{tag}>"
while True:
new_merged = re.sub(pattern, replace_whitespace, html)
if new_merged == html:
break
html = new_merged
return html
def generate_page_stats(self, document: Document, document_output):
page_stats = []
for page in document.pages:
block_counts = Counter(
[str(block.block_type) for block in page.children]
).most_common()
block_metadata = page.aggregate_block_metadata()
page_stats.append(
{
"page_id": page.page_id,
"text_extraction_method": page.text_extraction_method,
"block_counts": block_counts,
"block_metadata": block_metadata.model_dump(),
}
)
return page_stats
def generate_document_metadata(self, document: Document, document_output):
metadata = {
"table_of_contents": document.table_of_contents,
"page_stats": self.generate_page_stats(document, document_output),
}
if document.debug_data_path is not None:
metadata["debug_data_path"] = document.debug_data_path
return metadata
def extract_block_html(self, document: Document, block_output: BlockOutput):
soup = BeautifulSoup(block_output.html, "html.parser")
content_refs = soup.find_all("content-ref")
ref_block_id = None
images = {}
for ref in content_refs:
src = ref.get("src")
sub_images = {}
for item in block_output.children:
if item.id == src:
content, sub_images_ = self.extract_block_html(document, item)
sub_images.update(sub_images_)
ref_block_id: BlockId = item.id
break
if ref_block_id.block_type in self.image_blocks and self.extract_images:
images[ref_block_id] = self.extract_image(
document, ref_block_id, to_base64=True
)
else:
images.update(sub_images)
ref.replace_with(BeautifulSoup(content, "html.parser"))
if block_output.id.block_type in self.image_blocks and self.extract_images:
images[block_output.id] = self.extract_image(
document, block_output.id, to_base64=True
)
return str(soup), images
```
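Illustrative calls to the two static helpers above; both are pure string transforms, so they can be exercised without a document.

```python
from marker.renderers import BaseRenderer

print(BaseRenderer.merge_consecutive_math("<math>a-</math> <math>b</math>"))
# expected: '<math>a b</math>'
print(BaseRenderer.merge_consecutive_tags("<p>one</p><p>two</p>", "p"))
# expected: '<p>onetwo</p>'
```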
--------------------------------------------------------------------------------
/marker/processors/llm/llm_sectionheader.py:
--------------------------------------------------------------------------------
```python
import json
from typing import List, Tuple
from tqdm import tqdm
from marker.logger import get_logger
from marker.processors.llm import BaseLLMComplexBlockProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.schema.document import Document
from marker.schema.groups import PageGroup
from pydantic import BaseModel
logger = get_logger()
class LLMSectionHeaderProcessor(BaseLLMComplexBlockProcessor):
page_prompt = """You're a text correction expert specializing in accurately analyzing complex PDF documents. You will be given a list of all of the section headers from a document, along with their page number and approximate dimensions. The headers will be formatted like below, and will be presented in order.
```json
[
{
"bbox": [x1, y1, x2, y2],
"width": x2 - x1,
"height": y2 - y1,
"page": 0,
"id": "/page/0/SectionHeader/1",
"html": "<h1>Introduction</h1>",
}, ...
]
```
Bboxes have been normalized to 0-1000.
Your goal is to make sure that the section headers have the correct levels (h1, h2, h3, h4, h5, or h6). If a section header does not have the right level, edit the html to fix it.
Guidelines:
- Edit the blocks to ensure that the section headers have the correct levels.
- Only edit the h1, h2, h3, h4, h5, and h6 tags. Do not change any other tags or content in the headers.
- Only output the headers that changed (if nothing changed, output nothing).
- Every header you output needs to have one and only one level tag (h1, h2, h3, h4, h5, or h6).
**Instructions:**
1. Carefully examine the provided section headers and JSON.
2. Identify any changes you'll need to make, and write a short analysis.
3. Output "no_corrections", or "corrections_needed", depending on whether you need to make changes.
4. If corrections are needed, output any blocks that need updates. Only output the block ids and html, like this:
```json
[
{
"id": "/page/0/SectionHeader/1",
"html": "<h2>Introduction</h2>"
},
...
]
```
**Example:**
Input:
Section Headers
```json
[
{
"bbox": [x1, y1, x2, y2],
"id": "/page/0/SectionHeader/1",
"page": 0,
"html": "1 Vector Operations",
},
{
"bbox": [x1, y1, x2, y2],
"id": "/page/0/SectionHeader/2",
"page": 0,
"html": "1.1 Vector Addition",
},
]
```
Output:
Analysis: The first section header is missing the h1 tag, and the second section header is missing the h2 tag.
```json
[
{
"id": "/page/0/SectionHeader/1",
"html": "<h1>1 Vector Operations</h1>"
},
{
"id": "/page/0/SectionHeader/2",
"html": "<h2>1.1 Vector Addition</h2>"
}
]
```
**Input:**
Section Headers
```json
{{section_header_json}}
```
"""
def get_selected_blocks(
self,
document: Document,
page: PageGroup,
) -> List[dict]:
selected_blocks = page.structure_blocks(document)
json_blocks = [
self.normalize_block_json(block, document, page, i)
for i, block in enumerate(selected_blocks)
]
return json_blocks
def process_rewriting(
self, document: Document, section_headers: List[Tuple[Block, dict]]
):
section_header_json = [sh[1] for sh in section_headers]
for item in section_header_json:
_, _, page_id, block_type, block_id = item["id"].split("/")
item["page"] = page_id
item["width"] = item["bbox"][2] - item["bbox"][0]
item["height"] = item["bbox"][3] - item["bbox"][1]
del item["block_type"] # Not needed, since they're all section headers
prompt = self.page_prompt.replace(
"{{section_header_json}}", json.dumps(section_header_json)
)
response = self.llm_service(
prompt, None, document.pages[0], SectionHeaderSchema
)
logger.debug(f"Got section header reponse from LLM: {response}")
if not response or "correction_type" not in response:
logger.warning("LLM did not return a valid response")
return
correction_type = response["correction_type"]
if correction_type == "no_corrections":
return
self.load_blocks(response)
self.handle_rewrites(response["blocks"], document)
def load_blocks(self, response):
if isinstance(response["blocks"], str):
response["blocks"] = json.loads(response["blocks"])
def rewrite_blocks(self, document: Document):
# Don't show progress if there are no blocks to process
section_headers = [
(block, self.normalize_block_json(block, document, page))
for page in document.pages
for block in page.structure_blocks(document)
if block.block_type == BlockTypes.SectionHeader
]
if len(section_headers) == 0:
return
pbar = tqdm(
total=1,
desc=f"Running {self.__class__.__name__}",
disable=self.disable_tqdm,
)
self.process_rewriting(document, section_headers)
pbar.update(1)
pbar.close()
class BlockSchema(BaseModel):
id: str
html: str
class SectionHeaderSchema(BaseModel):
analysis: str
correction_type: str
blocks: List[BlockSchema]
```
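The processor above fills `{{section_header_json}}` by plain string replacement and then expects the LLM to answer in the shape of `SectionHeaderSchema`. Below is a minimal sketch of that fill-and-validate round trip, using made-up header data and a mocked response instead of marker's `llm_service`; only `BlockSchema` and `SectionHeaderSchema` mirror the classes defined in this file.

```python
# Sketch of the prompt-fill + response-validation flow; sample data is illustrative.
import json
from typing import List
from pydantic import BaseModel, ValidationError


class BlockSchema(BaseModel):
    id: str
    html: str


class SectionHeaderSchema(BaseModel):
    analysis: str
    correction_type: str
    blocks: List[BlockSchema]


PAGE_PROMPT = "...{{section_header_json}}..."  # stands in for the real page_prompt

headers = [
    {
        "bbox": [100, 50, 600, 80],
        "width": 500,
        "height": 30,
        "page": "0",
        "id": "/page/0/SectionHeader/1",
        "html": "<h3>Introduction</h3>",
    }
]

# 1. Substitute the JSON payload into the template, as process_rewriting does.
prompt = PAGE_PROMPT.replace("{{section_header_json}}", json.dumps(headers))

# 2. Validate a (mock) LLM response against the schema before applying rewrites.
raw_response = {
    "analysis": "The introduction should be a top-level heading.",
    "correction_type": "corrections_needed",
    "blocks": [{"id": "/page/0/SectionHeader/1", "html": "<h1>Introduction</h1>"}],
}
try:
    parsed = SectionHeaderSchema(**raw_response)
    if parsed.correction_type == "corrections_needed":
        for block in parsed.blocks:
            print(block.id, "->", block.html)
except ValidationError as exc:
    print("LLM response did not match the schema:", exc)
```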
--------------------------------------------------------------------------------
/marker/renderers/html.py:
--------------------------------------------------------------------------------
```python
import textwrap
from PIL import Image
from typing import Annotated, Tuple
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from pydantic import BaseModel
from marker.renderers import BaseRenderer
from marker.schema import BlockTypes
from marker.schema.blocks import BlockId
from marker.settings import settings
# Ignore beautifulsoup warnings
import warnings
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
# Suppress DecompressionBombError
Image.MAX_IMAGE_PIXELS = None
class HTMLOutput(BaseModel):
html: str
images: dict
metadata: dict
class HTMLRenderer(BaseRenderer):
"""
A renderer for HTML output.
"""
page_blocks: Annotated[
Tuple[BlockTypes],
"The block types to consider as pages.",
] = (BlockTypes.Page,)
paginate_output: Annotated[
bool,
"Whether to paginate the output.",
] = False
def extract_image(self, document, image_id):
image_block = document.get_block(image_id)
cropped = image_block.get_image(
document, highres=self.image_extraction_mode == "highres"
)
return cropped
def insert_block_id(self, soup, block_id: BlockId):
"""
Insert a block ID into the soup as a data attribute.
"""
if block_id.block_type in [BlockTypes.Line, BlockTypes.Span]:
return soup
if self.add_block_ids:
# Find the outermost tag (first tag that isn't a NavigableString)
outermost_tag = None
for element in soup.contents:
if hasattr(element, "name") and element.name:
outermost_tag = element
break
# If we found an outermost tag, add the data-block-id attribute
if outermost_tag:
outermost_tag["data-block-id"] = str(block_id)
# If soup only contains text or no tags, wrap in a span
elif soup.contents:
wrapper = soup.new_tag("span")
wrapper["data-block-id"] = str(block_id)
contents = list(soup.contents)
for content in contents:
content.extract()
wrapper.append(content)
soup.append(wrapper)
return soup
def extract_html(self, document, document_output, level=0):
soup = BeautifulSoup(document_output.html, "html.parser")
content_refs = soup.find_all("content-ref")
ref_block_id = None
images = {}
for ref in content_refs:
src = ref.get("src")
sub_images = {}
content = ""
for item in document_output.children:
if item.id == src:
content, sub_images_ = self.extract_html(document, item, level + 1)
sub_images.update(sub_images_)
ref_block_id: BlockId = item.id
break
if ref_block_id.block_type in self.image_blocks:
if self.extract_images:
image = self.extract_image(document, ref_block_id)
image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
images[image_name] = image
element = BeautifulSoup(
f"<p>{content}<img src='{image_name}'></p>", "html.parser"
)
ref.replace_with(self.insert_block_id(element, ref_block_id))
else:
# This will be the image description if using llm mode, or empty if not
element = BeautifulSoup(f"{content}", "html.parser")
ref.replace_with(self.insert_block_id(element, ref_block_id))
elif ref_block_id.block_type in self.page_blocks:
images.update(sub_images)
if self.paginate_output:
content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
element = BeautifulSoup(f"{content}", "html.parser")
ref.replace_with(self.insert_block_id(element, ref_block_id))
else:
images.update(sub_images)
element = BeautifulSoup(f"{content}", "html.parser")
ref.replace_with(self.insert_block_id(element, ref_block_id))
output = str(soup)
if level == 0:
output = self.merge_consecutive_tags(output, "b")
output = self.merge_consecutive_tags(output, "i")
output = self.merge_consecutive_math(
output
) # Merge consecutive inline math tags
output = textwrap.dedent(f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
</head>
<body>
{output}
</body>
</html>
""")
return output, images
def __call__(self, document) -> HTMLOutput:
document_output = document.render(self.block_config)
full_html, images = self.extract_html(document, document_output)
soup = BeautifulSoup(full_html, "html.parser")
full_html = soup.prettify() # Add indentation to the HTML
return HTMLOutput(
html=full_html,
images=images,
metadata=self.generate_document_metadata(document, document_output),
)
```
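The branching in `insert_block_id` above (annotate the outermost tag with `data-block-id`, otherwise wrap bare text in a `<span>`) can be exercised with BeautifulSoup alone. A small sketch with invented fragments and block ids; `tag_with_block_id` is an illustrative helper, not part of marker:

```python
# Standalone illustration of the data-block-id tagging performed by
# HTMLRenderer.insert_block_id; the sample fragments are made up.
from bs4 import BeautifulSoup


def tag_with_block_id(fragment: str, block_id: str) -> str:
    soup = BeautifulSoup(fragment, "html.parser")
    # Case 1: the fragment has an outermost tag -> annotate it directly.
    outermost = next((el for el in soup.contents if getattr(el, "name", None)), None)
    if outermost is not None:
        outermost["data-block-id"] = block_id
    elif soup.contents:
        # Case 2: bare text only -> wrap it in a span carrying the id.
        wrapper = soup.new_tag("span")
        wrapper["data-block-id"] = block_id
        for content in list(soup.contents):
            content.extract()
            wrapper.append(content)
        soup.append(wrapper)
    return str(soup)


print(tag_with_block_id("<p>Hello</p>", "/page/0/Text/3"))
# <p data-block-id="/page/0/Text/3">Hello</p>
print(tag_with_block_id("plain text", "/page/0/Text/4"))
# <span data-block-id="/page/0/Text/4">plain text</span>
```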
--------------------------------------------------------------------------------
/marker/extractors/page.py:
--------------------------------------------------------------------------------
```python
import json
from concurrent.futures import ThreadPoolExecutor
from pydantic import BaseModel
from typing import Annotated, Optional, List
from tqdm import tqdm
from marker.extractors import BaseExtractor
from marker.logger import get_logger
logger = get_logger()
class PageExtractionSchema(BaseModel):
description: str
detailed_notes: str
class PageExtractor(BaseExtractor):
"""
An extractor that pulls data from a single page.
"""
extraction_page_chunk_size: Annotated[
int, "The number of pages to chunk together for extraction."
] = 3
page_schema: Annotated[
str,
"The JSON schema to be extracted from the page.",
] = ""
page_extraction_prompt = """You are an expert document analyst who reads documents and pulls data out in JSON format. You will receive the markdown representation of a document page, and a JSON schema that we want to extract from the document. Your task is to write detailed notes on this page, so that when you look at all your notes from across the document, you can fill in the schema.
Some notes:
- The schema may contain a single object to extract from the entire document, or an array of objects.
- The schema may contain nested objects, arrays, and other complex structures.
Some guidelines:
- Write very thorough notes, and include specific JSON snippets that can be extracted from the page.
- You may need information from prior or subsequent pages to fully fill in the schema, so make sure to write detailed notes that will let you join entities across pages later on.
- Estimate your confidence in the values you extract, so you can reconstruct the JSON later when you only have your notes.
- Some tables and other data structures may continue on a subsequent page, so make sure to store the positions that data comes from where appropriate.
**Instructions:**
1. Analyze the provided markdown representation of the page.
2. Analyze the JSON schema.
3. Write a short description of the fields in the schema, and the associated values in the markdown.
4. Write detailed notes on the page, including any values that can be extracted from the markdown. Include snippets of JSON that can be extracted from the page where possible.
**Example:**
Input:
Markdown
```markdown
| Make | Sales |
|--------|-------|
| Honda | 100 |
| Toyota | 200 |
```
Schema
```json
{'$defs': {'Cars': {'properties': {'make': {'title': 'Make', 'type': 'string'}, 'sales': {'title': 'Sales', 'type': 'integer'}, 'color': {'title': 'Color', 'type': 'string'}}, 'required': ['make', 'sales', 'color'], 'title': 'Cars', 'type': 'object'}}, 'properties': {'cars': {'items': {'$ref': '#/$defs/Cars'}, 'title': 'Cars', 'type': 'array'}}, 'required': ['cars'], 'title': 'CarsList', 'type': 'object'}
```
Output:
Description: The schema has a list of cars, each with a make, sales, and color. The image and markdown contain a table with 2 cars: Honda with 100 sales and Toyota with 200 sales. The color is not present in the table.
Detailed Notes: On this page, I see a table with car makes and sales. The makes are Honda and Toyota, with sales of 100 and 200 respectively. The color is not present in the table, so I will leave it blank in the JSON. That information may be present on another page. Some JSON snippets I may find useful later are:
```json
{
"make": "Honda",
"sales": 100,
}
```
```json
{
"make": "Toyota",
"sales": 200,
}
```
Honda is the first row in the table, and Toyota is the second row. Make is the first column, and sales is the second.
**Input:**
Markdown
```markdown
{{page_md}}
```
Schema
```json
{{schema}}
```
"""
def chunk_page_markdown(self, page_markdown: List[str]) -> List[str]:
"""
Chunk the page markdown into smaller pieces for processing.
"""
chunks = []
for i in range(0, len(page_markdown), self.extraction_page_chunk_size):
chunk = page_markdown[i : i + self.extraction_page_chunk_size]
chunks.append("\n\n".join(chunk))
return chunks
def inference_single_chunk(
self, page_markdown: str
) -> Optional[PageExtractionSchema]:
prompt = self.page_extraction_prompt.replace(
"{{page_md}}", page_markdown
).replace("{{schema}}", json.dumps(self.page_schema))
response = self.llm_service(prompt, None, None, PageExtractionSchema)
logger.debug(f"Page extraction response: {response}")
if not response or any(
[
key not in response
for key in [
"description",
"detailed_notes",
]
]
):
return None
return PageExtractionSchema(
description=response["description"],
detailed_notes=response["detailed_notes"],
)
def __call__(
self,
page_markdown: List[str],
**kwargs,
) -> List[PageExtractionSchema]:
if not self.page_schema:
raise ValueError(
"Page schema must be defined for structured extraction to work."
)
chunks = self.chunk_page_markdown(page_markdown)
results = []
pbar = tqdm(
desc="Running page extraction",
disable=self.disable_tqdm,
total=len(chunks),
)
with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
for future in [
executor.submit(self.inference_single_chunk, chunk) for chunk in chunks
]:
results.append(future.result()) # Raise exceptions if any occurred
pbar.update(1)
pbar.close()
return results
```
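A quick standalone sketch of how `chunk_page_markdown` groups pages and how `inference_single_chunk` substitutes `{{page_md}}` and `{{schema}}` into the extraction prompt; the page texts, chunk size, and schema below are placeholders, and the template string stands in for the real `page_extraction_prompt`.

```python
# Pages are grouped extraction_page_chunk_size at a time, joined with blank
# lines, and each chunk is substituted into the prompt template.
import json

extraction_page_chunk_size = 3
page_markdown = [f"# Page {i}\n\nbody {i}" for i in range(7)]

chunks = [
    "\n\n".join(page_markdown[i : i + extraction_page_chunk_size])
    for i in range(0, len(page_markdown), extraction_page_chunk_size)
]
print(len(chunks))  # 3 chunks: pages 0-2, 3-5, 6

template = "Markdown:\n{{page_md}}\nSchema:\n{{schema}}"
schema = {"title": "CarsList", "type": "object"}
prompts = [
    template.replace("{{page_md}}", chunk).replace("{{schema}}", json.dumps(schema))
    for chunk in chunks
]
```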
--------------------------------------------------------------------------------
/marker/scripts/common.py:
--------------------------------------------------------------------------------
```python
import ast
import base64
import io
import re
import sys
from typing import Optional
from PIL import Image
import click
import pypdfium2
import streamlit as st
from pydantic import BaseModel
from streamlit.runtime.uploaded_file_manager import UploadedFile
from marker.config.parser import ConfigParser
from marker.config.printer import CustomClickPrinter
from marker.models import create_model_dict
from marker.settings import settings
@st.cache_data()
def parse_args():
# Use to grab common cli options
@ConfigParser.common_options
def options_func():
pass
def extract_click_params(decorated_function):
if hasattr(decorated_function, "__click_params__"):
return decorated_function.__click_params__
return []
cmd = CustomClickPrinter("Marker app.")
extracted_params = extract_click_params(options_func)
cmd.params.extend(extracted_params)
ctx = click.Context(cmd)
try:
cmd_args = sys.argv[1:]
cmd.parse_args(ctx, cmd_args)
return ctx.params
except click.exceptions.ClickException as e:
return {"error": str(e)}
@st.cache_resource()
def load_models():
return create_model_dict()
def open_pdf(pdf_file):
stream = io.BytesIO(pdf_file.getvalue())
return pypdfium2.PdfDocument(stream)
def img_to_html(img, img_alt):
img_bytes = io.BytesIO()
img.save(img_bytes, format=settings.OUTPUT_IMAGE_FORMAT)
img_bytes = img_bytes.getvalue()
encoded = base64.b64encode(img_bytes).decode()
img_html = f'<img src="data:image/{settings.OUTPUT_IMAGE_FORMAT.lower()};base64,{encoded}" alt="{img_alt}" style="max-width: 100%;">'
return img_html
@st.cache_data()
def get_page_image(pdf_file, page_num, dpi=96):
if "pdf" in pdf_file.type:
doc = open_pdf(pdf_file)
page = doc[page_num]
png_image = (
page.render(
scale=dpi / 72,
)
.to_pil()
.convert("RGB")
)
else:
png_image = Image.open(pdf_file).convert("RGB")
return png_image
@st.cache_data()
def page_count(pdf_file: UploadedFile):
if "pdf" in pdf_file.type:
doc = open_pdf(pdf_file)
return len(doc) - 1
else:
return 1
def pillow_image_to_base64_string(img: Image) -> str:
buffered = io.BytesIO()
img.save(buffered, format="JPEG")
return base64.b64encode(buffered.getvalue()).decode("utf-8")
def extract_root_pydantic_class(schema_code: str) -> Optional[str]:
try:
# Parse the code into an AST
tree = ast.parse(schema_code)
# Find all class definitions that inherit from BaseModel
class_names = set()
class_info = {} # Store information about each class
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef):
# Check if this class inherits from BaseModel
is_pydantic = False
for base in node.bases:
if isinstance(base, ast.Name) and base.id == "BaseModel":
is_pydantic = True
break
if is_pydantic:
class_names.add(node.name)
class_info[node.name] = {
"references": set(), # Classes this class references
"fields": [], # Field names in this class
}
# Extract field information
for item in node.body:
if isinstance(item, ast.AnnAssign) and isinstance(
item.target, ast.Name
):
field_name = item.target.id
class_info[node.name]["fields"].append(field_name)
# Check if this field references another class
annotation_str = ast.unparse(item.annotation)
# Look for List[ClassName], Optional[ClassName], Dict[Any, ClassName], etc.
for other_class in class_names:
pattern = rf"(?:List|Dict|Set|Tuple|Optional|Union)?\[.*{other_class}.*\]|{other_class}"
if re.search(pattern, annotation_str):
class_info[node.name]["references"].add(other_class)
if len(class_names) == 1:
return list(class_names)[0]
referenced_classes = set()
for class_name, info in class_info.items():
referenced_classes.update(info["references"])
# Find classes that reference others but aren't referenced themselves (potential roots)
root_candidates = set()
for class_name, info in class_info.items():
if info["references"] and class_name not in referenced_classes:
root_candidates.add(class_name)
# If we found exactly one root candidate, return it
if len(root_candidates) == 1:
return list(root_candidates)[0]
return None
except Exception as e:
print(f"Error parsing schema: {e}")
return None
def get_root_class(schema_code: str) -> Optional[BaseModel]:
root_class_name = extract_root_pydantic_class(schema_code)
if not root_class_name:
return None
if "from pydantic" not in schema_code:
schema_code = "from pydantic import BaseModel\n" + schema_code
if "from typing" not in schema_code:
schema_code = (
"from typing import List, Dict, Optional, Set, Tuple, Union, Any\n\n"
+ schema_code
)
# Execute the code in a new namespace
namespace = {}
exec(schema_code, namespace)
# Return the root class object
return namespace.get(root_class_name)
```
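A toy illustration of what `get_root_class` ultimately does: execute a user-supplied schema string and hand back the root pydantic model. The schema text here is invented, and this sketch hard-codes the root class name instead of running the AST-based root detection shown above.

```python
# Execute a schema string, then pick out the root model and instantiate it.
schema_code = """
from typing import List
from pydantic import BaseModel

class Car(BaseModel):
    make: str
    sales: int

class CarsList(BaseModel):
    cars: List[Car]
"""

namespace = {}
exec(schema_code, namespace)

# CarsList references Car but nothing references CarsList, so it is the root.
root = namespace["CarsList"]
print(root.__name__)  # CarsList
print(root(cars=[{"make": "Honda", "sales": 100}]))
```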