This is page 1 of 7. Use http://codebase.md/datalab-to/marker?page={x} to view the full context.
# Directory Structure
```
├── .github
│ ├── ISSUE_TEMPLATE
│ │ ├── breaking-bug-report.md
│ │ ├── feature_request.md
│ │ └── output-bug-report.md
│ └── workflows
│ ├── benchmarks.yml
│ ├── ci.yml
│ ├── cla.yml
│ ├── publish.yml
│ └── scripts.yml
├── .gitignore
├── .pre-commit-config.yaml
├── benchmarks
│ ├── __init__.py
│ ├── overall
│ │ ├── __init__.py
│ │ ├── display
│ │ │ ├── __init__.py
│ │ │ ├── dataset.py
│ │ │ └── table.py
│ │ ├── download
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── llamaparse.py
│ │ │ ├── main.py
│ │ │ ├── mathpix.py
│ │ │ └── mistral.py
│ │ ├── elo.py
│ │ ├── methods
│ │ │ ├── __init__.py
│ │ │ ├── docling.py
│ │ │ ├── gt.py
│ │ │ ├── llamaparse.py
│ │ │ ├── marker.py
│ │ │ ├── mathpix.py
│ │ │ ├── mistral.py
│ │ │ ├── olmocr.py
│ │ │ └── schema.py
│ │ ├── overall.py
│ │ ├── registry.py
│ │ ├── schema.py
│ │ └── scorers
│ │ ├── __init__.py
│ │ ├── clean.py
│ │ ├── heuristic.py
│ │ ├── llm.py
│ │ └── schema.py
│ ├── table
│ │ ├── __init__.py
│ │ ├── gemini.py
│ │ ├── inference.py
│ │ ├── scoring.py
│ │ └── table.py
│ ├── throughput
│ │ ├── __init__.py
│ │ └── main.py
│ └── verify_scores.py
├── chunk_convert.py
├── CLA.md
├── convert_single.py
├── convert.py
├── data
│ ├── .gitignore
│ ├── examples
│ │ ├── json
│ │ │ ├── multicolcnn.json
│ │ │ ├── switch_trans.json
│ │ │ └── thinkpython.json
│ │ └── markdown
│ │ ├── multicolcnn
│ │ │ ├── _page_1_Figure_0.jpeg
│ │ │ ├── _page_2_Picture_0.jpeg
│ │ │ ├── _page_6_Figure_0.jpeg
│ │ │ ├── _page_7_Figure_0.jpeg
│ │ │ ├── multicolcnn_meta.json
│ │ │ └── multicolcnn.md
│ │ ├── switch_transformers
│ │ │ ├── _page_11_Figure_4.jpeg
│ │ │ ├── _page_12_Figure_4.jpeg
│ │ │ ├── _page_13_Figure_2.jpeg
│ │ │ ├── _page_18_Figure_1.jpeg
│ │ │ ├── _page_18_Figure_3.jpeg
│ │ │ ├── _page_2_Figure_3.jpeg
│ │ │ ├── _page_20_Figure_1.jpeg
│ │ │ ├── _page_20_Figure_4.jpeg
│ │ │ ├── _page_27_Figure_1.jpeg
│ │ │ ├── _page_29_Figure_1.jpeg
│ │ │ ├── _page_30_Figure_1.jpeg
│ │ │ ├── _page_31_Figure_3.jpeg
│ │ │ ├── _page_4_Figure_1.jpeg
│ │ │ ├── _page_5_Figure_3.jpeg
│ │ │ ├── switch_trans_meta.json
│ │ │ └── switch_trans.md
│ │ └── thinkpython
│ │ ├── _page_109_Figure_1.jpeg
│ │ ├── _page_115_Figure_1.jpeg
│ │ ├── _page_116_Figure_3.jpeg
│ │ ├── _page_127_Figure_1.jpeg
│ │ ├── _page_128_Figure_1.jpeg
│ │ ├── _page_167_Figure_1.jpeg
│ │ ├── _page_169_Figure_1.jpeg
│ │ ├── _page_173_Figure_1.jpeg
│ │ ├── _page_190_Figure_1.jpeg
│ │ ├── _page_195_Figure_1.jpeg
│ │ ├── _page_205_Figure_1.jpeg
│ │ ├── _page_23_Figure_1.jpeg
│ │ ├── _page_23_Figure_3.jpeg
│ │ ├── _page_230_Figure_1.jpeg
│ │ ├── _page_233_Figure_1.jpeg
│ │ ├── _page_233_Figure_3.jpeg
│ │ ├── _page_234_Figure_1.jpeg
│ │ ├── _page_235_Figure_1.jpeg
│ │ ├── _page_236_Figure_1.jpeg
│ │ ├── _page_236_Figure_3.jpeg
│ │ ├── _page_237_Figure_1.jpeg
│ │ ├── _page_238_Figure_1.jpeg
│ │ ├── _page_46_Figure_1.jpeg
│ │ ├── _page_60_Figure_1.jpeg
│ │ ├── _page_60_Figure_3.jpeg
│ │ ├── _page_67_Figure_1.jpeg
│ │ ├── _page_71_Figure_1.jpeg
│ │ ├── _page_78_Figure_1.jpeg
│ │ ├── _page_85_Figure_1.jpeg
│ │ ├── _page_94_Figure_1.jpeg
│ │ ├── _page_99_Figure_17.jpeg
│ │ ├── _page_99_Figure_178.jpeg
│ │ ├── thinkpython_meta.json
│ │ └── thinkpython.md
│ ├── images
│ │ ├── overall.png
│ │ ├── per_doc.png
│ │ └── table.png
│ └── latex_to_md.sh
├── examples
│ ├── marker_modal_deployment.py
│ └── README.md
├── extraction_app.py
├── LICENSE
├── marker
│ ├── builders
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── layout.py
│ │ ├── line.py
│ │ ├── ocr.py
│ │ └── structure.py
│ ├── config
│ │ ├── __init__.py
│ │ ├── crawler.py
│ │ ├── parser.py
│ │ └── printer.py
│ ├── converters
│ │ ├── __init__.py
│ │ ├── extraction.py
│ │ ├── ocr.py
│ │ ├── pdf.py
│ │ └── table.py
│ ├── extractors
│ │ ├── __init__.py
│ │ ├── document.py
│ │ └── page.py
│ ├── logger.py
│ ├── models.py
│ ├── output.py
│ ├── processors
│ │ ├── __init__.py
│ │ ├── blank_page.py
│ │ ├── block_relabel.py
│ │ ├── blockquote.py
│ │ ├── code.py
│ │ ├── debug.py
│ │ ├── document_toc.py
│ │ ├── equation.py
│ │ ├── footnote.py
│ │ ├── ignoretext.py
│ │ ├── line_merge.py
│ │ ├── line_numbers.py
│ │ ├── list.py
│ │ ├── llm
│ │ │ ├── __init__.py
│ │ │ ├── llm_complex.py
│ │ │ ├── llm_equation.py
│ │ │ ├── llm_form.py
│ │ │ ├── llm_handwriting.py
│ │ │ ├── llm_image_description.py
│ │ │ ├── llm_mathblock.py
│ │ │ ├── llm_meta.py
│ │ │ ├── llm_page_correction.py
│ │ │ ├── llm_sectionheader.py
│ │ │ ├── llm_table_merge.py
│ │ │ └── llm_table.py
│ │ ├── order.py
│ │ ├── page_header.py
│ │ ├── reference.py
│ │ ├── sectionheader.py
│ │ ├── table.py
│ │ ├── text.py
│ │ └── util.py
│ ├── providers
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── epub.py
│ │ ├── html.py
│ │ ├── image.py
│ │ ├── pdf.py
│ │ ├── powerpoint.py
│ │ ├── registry.py
│ │ ├── spreadsheet.py
│ │ └── utils.py
│ ├── renderers
│ │ ├── __init__.py
│ │ ├── chunk.py
│ │ ├── extraction.py
│ │ ├── html.py
│ │ ├── json.py
│ │ ├── markdown.py
│ │ └── ocr_json.py
│ ├── schema
│ │ ├── __init__.py
│ │ ├── blocks
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── basetable.py
│ │ │ ├── caption.py
│ │ │ ├── code.py
│ │ │ ├── complexregion.py
│ │ │ ├── equation.py
│ │ │ ├── figure.py
│ │ │ ├── footnote.py
│ │ │ ├── form.py
│ │ │ ├── handwriting.py
│ │ │ ├── inlinemath.py
│ │ │ ├── listitem.py
│ │ │ ├── pagefooter.py
│ │ │ ├── pageheader.py
│ │ │ ├── picture.py
│ │ │ ├── reference.py
│ │ │ ├── sectionheader.py
│ │ │ ├── table.py
│ │ │ ├── tablecell.py
│ │ │ ├── text.py
│ │ │ └── toc.py
│ │ ├── document.py
│ │ ├── groups
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── figure.py
│ │ │ ├── list.py
│ │ │ ├── page.py
│ │ │ ├── picture.py
│ │ │ └── table.py
│ │ ├── polygon.py
│ │ ├── registry.py
│ │ └── text
│ │ ├── __init__.py
│ │ ├── char.py
│ │ ├── line.py
│ │ └── span.py
│ ├── scripts
│ │ ├── __init__.py
│ │ ├── chunk_convert.py
│ │ ├── chunk_convert.sh
│ │ ├── common.py
│ │ ├── convert_single.py
│ │ ├── convert.py
│ │ ├── extraction_app.py
│ │ ├── file_to_s3.py
│ │ ├── run_streamlit_app.py
│ │ ├── server.py
│ │ └── streamlit_app.py
│ ├── services
│ │ ├── __init__.py
│ │ ├── azure_openai.py
│ │ ├── claude.py
│ │ ├── gemini.py
│ │ ├── ollama.py
│ │ ├── openai.py
│ │ └── vertex.py
│ ├── settings.py
│ ├── util.py
│ └── utils
│ ├── __init__.py
│ ├── batch.py
│ ├── gpu.py
│ └── image.py
├── marker_app.py
├── marker_server.py
├── MODEL_LICENSE
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── README.md
├── signatures
│ └── version1
│ └── cla.json
├── static
│ └── fonts
│ └── .gitignore
└── tests
├── builders
│ ├── test_blank_page.py
│ ├── test_document_builder.py
│ ├── test_garbled_pdf.py
│ ├── test_layout_replace.py
│ ├── test_ocr_builder.py
│ ├── test_ocr_pipeline.py
│ ├── test_overriding.py
│ ├── test_pdf_links.py
│ ├── test_rotated_bboxes.py
│ ├── test_strip_existing_ocr.py
│ └── test_structure.py
├── config
│ └── test_config.py
├── conftest.py
├── converters
│ ├── test_extraction_converter.py
│ ├── test_ocr_converter.py
│ ├── test_pdf_converter.py
│ └── test_table_converter.py
├── processors
│ ├── test_document_toc_processor.py
│ ├── test_equation_processor.py
│ ├── test_footnote_processor.py
│ ├── test_ignoretext.py
│ ├── test_llm_processors.py
│ ├── test_table_merge.py
│ └── test_table_processor.py
├── providers
│ ├── test_document_providers.py
│ ├── test_image_provider.py
│ └── test_pdf_provider.py
├── renderers
│ ├── test_chunk_renderer.py
│ ├── test_extract_images.py
│ ├── test_html_renderer.py
│ ├── test_json_renderer.py
│ └── test_markdown_renderer.py
├── schema
│ └── groups
│ └── test_list_grouping.py
├── services
│ └── test_service_init.py
└── utils.py
```
# Files
--------------------------------------------------------------------------------
/static/fonts/.gitignore:
--------------------------------------------------------------------------------
```
*
!.gitignore
```
--------------------------------------------------------------------------------
/data/.gitignore:
--------------------------------------------------------------------------------
```
latex
pdfs
references
```
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
```yaml
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.9.10
hooks:
# Run the linter.
- id: ruff
types_or: [ python, pyi ]
args: [ --fix ]
# Run the formatter.
- id: ruff-format
types_or: [ python, pyi ]
```
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
private.py
.DS_Store
local.env
experiments
test_data
training
wandb
*.dat
report.json
benchmark_data
debug_data
temp.md
temp
conversion_results
uploads
/cache
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
.vscode/
```
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
```markdown
## Usage Examples
This directory contains examples of running `marker` in different contexts.
### Usage with Modal
We have a [self-contained example](./marker_modal_deployment.py) that shows how you can quickly use [Modal](https://modal.com) to deploy `marker` by provisioning a container with a GPU, and expose that with an API so you can submit PDFs for conversion into Markdown, HTML, or JSON.
It's a limited example that you can extend into different use cases.
#### Pre-requisites
Make sure you have the `modal` client installed by [following their instructions here](https://modal.com/docs/guide#getting-started).
Modal's [Starter Plan](https://modal.com/pricing) includes $30 of free compute each month.
Modal is [serverless](https://arxiv.org/abs/1902.03383), so you only pay for resources when you are using them.
#### Running the example
Once `modal` is configured, you can deploy it to your workspace by running:
> modal deploy marker_modal_deployment.py
Notes:
- `marker` has a few models it uses. By default, the endpoint will check if these models are loaded and download them if not (first request will be slow). You can avoid this by running
> modal run marker_modal_deployment.py::download_models
Which will create a [`Modal Volume`](https://modal.com/docs/guide/Volumes) to store them for re-use.
Once the deploy is finished, you can:
- Test a file upload locally through your CLI using an `invoke_conversion` command we expose through Modal's [`local_entrypoint`](https://modal.com/docs/reference/modal.App#local_entrypoint)
- Get the URL of your endpoint and make a request through a client of your choice.
**Test from your CLI with `invoke_conversion`**
If your endpoint is live, simply run this command:
```
$ modal run marker_modal_deployment.py::invoke_conversion --pdf-file <PDF_FILE_PATH> --output-format markdown
```
And it'll automatically detect the URL of your new endpoint using [`.get_web_url()`](https://modal.com/docs/guide/webhook-urls#determine-the-url-of-a-web-endpoint-from-code), make sure it's healthy, submit your file, and store its output on your machine (in the same directory).
**Making a request using your own client**
If you want to make requests elsewhere e.g. with cURL or a client like Insomnia, you'll need to get the URL.
When your `modal deploy` command from earlier finishes, it'll include your endpoint URL at the end. For example:
```
$ modal deploy marker_modal_deployment.py
...
✓ Created objects.
├── 🔨 Created mount /marker/examples/marker_modal_deployment.py
├── 🔨 Created function download_models.
├── 🔨 Created function MarkerModalDemoService.*.
└── 🔨 Created web endpoint for MarkerModalDemoService.fastapi_app => <YOUR_ENDPOINT_URL>
✓ App deployed in 149.877s! 🎉
```
If you accidentally close your terminal session, you can also always go into Modal's dashboard and:
- Find the app (default name: `datalab-marker-modal-demo`)
- Click on `MarkerModalDemoService`
- Find your endpoint URL
Once you have your URL, make a request to `{YOUR_ENDPOINT_URL}/convert` like this (you can also use Insomnia, etc.):
```
curl --request POST \
--url {BASE_URL}/convert \
--header 'Content-Type: multipart/form-data' \
--form file=@/Users/cooldev/sample.pdf \
--form output_format=html
```
You should get a response like this
```
{
"success": true,
"filename": "sample.pdf",
"output_format": "html",
"json": null,
"html": "<YOUR_RESPONSE_CONTENT>",
"markdown": null,
"images": {},
"metadata": {... page level metadata ...},
"page_count": 2
}
```
[Modal](https://modal.com) makes deploying and scaling models and inference workloads much easier.
If you're interested in Datalab's managed API or on-prem document intelligence solution, check out [our platform here](https://datalab.to/?utm_source=gh-marker).
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
# Marker
Marker converts documents to markdown, JSON, chunks, and HTML quickly and accurately.
- Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages
- Formats tables, forms, equations, inline math, links, references, and code blocks
- Extracts and saves images
- Removes headers/footers/other artifacts
- Extensible with your own formatting and logic
- Does structured extraction, given a JSON schema (beta)
- Optionally boost accuracy with LLMs (and your own prompt)
- Works on GPU, CPU, or MPS
For our managed API or on-prem document intelligence solution, check out [our platform here](https://datalab.to?utm_source=gh-marker).
## Performance
<img src="data/images/overall.png" width="800px"/>
Marker benchmarks favorably compared to cloud services like Llamaparse and Mathpix, as well as other open source tools.
The above results are running single PDF pages serially. Marker is significantly faster when running in batch mode, with a projected throughput of 25 pages/second on an H100.
See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.
## Hybrid Mode
For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker. This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms. It can use any gemini or ollama model. By default, it uses `gemini-2.0-flash`. See [below](#llm-services) for details.
Here is a table benchmark comparing marker, gemini flash alone, and marker with use_llm:
<img src="data/images/table.png" width="400px"/>
As you can see, the use_llm mode offers higher accuracy than marker or gemini alone.
## Examples
| PDF | File type | Markdown | JSON |
|-----|-----------|------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------|
| [Think Python](https://greenteapress.com/thinkpython/thinkpython.pdf) | Textbook | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/thinkpython/thinkpython.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/thinkpython.json) |
| [Switch Transformers](https://arxiv.org/pdf/2101.03961.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/switch_transformers/switch_trans.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/switch_trans.json) |
| [Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/multicolcnn/multicolcnn.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/multicolcnn.json) |
# Commercial usage
Our model weights use a modified AI Pubs Open Rail-M license (free for research, personal use, and startups under $2M funding/revenue) and our code is GPL. For broader commercial licensing or to remove GPL requirements, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-marker).
# Hosted API & On-prem
There's a [hosted API](https://www.datalab.to?utm_source=gh-marker) and [painless on-prem solution](https://www.datalab.to/blog/self-serve-on-prem-licensing) for marker - it's free to sign up, and we'll throw in credits for you to test it out.
The API:
- Supports PDF, image, PPT, PPTX, DOC, DOCX, XLS, XLSX, HTML, EPUB files
- Is 1/4th the price of leading cloud-based competitors
- Fast - ~15s for a 250 page PDF
- Supports LLM mode
- High uptime (99.99%)
# Community
[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.
# Installation
You'll need python 3.10+ and [PyTorch](https://pytorch.org/get-started/locally/).
Install with:
```shell
pip install marker-pdf
```
If you want to use marker on documents other than PDFs, you will need to install additional dependencies with:
```shell
pip install marker-pdf[full]
```
# Usage
First, some configuration:
- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
- Some PDFs, even digital ones, have bad text in them. Set `--force_ocr` to force OCR on all lines, or set `--strip_existing_ocr` to keep all digital text but strip out any existing OCR text.
- If you care about inline math, set `force_ocr` to convert inline math to LaTeX.
## Interactive App
I've included a streamlit app that lets you interactively try marker with some basic options. Run it with:
```shell
pip install streamlit streamlit-ace
marker_gui
```
## Convert a single file
```shell
marker_single /path/to/file.pdf
```
You can pass in PDFs or images.
Options:
- `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
- `--output_format [markdown|json|html|chunks]`: Specify the format for the output results.
- `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
- `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n`
- `--use_llm`: Uses an LLM to improve accuracy. You will need to configure the LLM backend - see [below](#llm-services).
- `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text. This will also format inline math properly.
- `--block_correction_prompt`: if LLM mode is active, an optional prompt that will be used to correct the output of marker. This is useful for custom formatting or logic that you want to apply to the output.
- `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya.
- `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`.
- `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description.
- `--debug`: Enable debug mode for additional logging and diagnostic information.
- `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"`
- `--config_json PATH`: Path to a JSON configuration file containing additional settings.
- `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
- `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`. The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables.
- `--llm_service`: Which llm service to use if `--use_llm` is passed. This defaults to `marker.services.gemini.GoogleGeminiService`.
- `--help`: see all of the flags that can be passed into marker. (it supports many more options than are listed above)
The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py). If you don't need OCR, marker can work with any language.
## Convert multiple files
```shell
marker /path/to/input/folder
```
- `marker` supports all the same options from `marker_single` above.
- `--workers` is the number of conversion workers to run simultaneously. This is automatically set by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average.
## Convert multiple files on multiple GPUs
```shell
NUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert ../pdf_in ../md_out
```
- `NUM_DEVICES` is the number of GPUs to use. Should be `2` or greater.
- `NUM_WORKERS` is the number of parallel processes to run on each GPU.
## Use from python
See the `PdfConverter` class at `marker/converters/pdf.py` for additional arguments that can be passed.
```python
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
converter = PdfConverter(
artifact_dict=create_model_dict(),
)
rendered = converter("FILEPATH")
text, _, images = text_from_rendered(rendered)
```
`rendered` will be a pydantic basemodel with different properties depending on the output type requested. With markdown output (default), you'll have the properties `markdown`, `metadata`, and `images`. For json output, you'll have `children`, `block_type`, and `metadata`.
### Custom configuration
You can pass configuration using the `ConfigParser`. To see all available options, do `marker_single --help`.
```python
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
config = {
"output_format": "json",
"ADDITIONAL_KEY": "VALUE"
}
config_parser = ConfigParser(config)
converter = PdfConverter(
config=config_parser.generate_config_dict(),
artifact_dict=create_model_dict(),
processor_list=config_parser.get_processors(),
renderer=config_parser.get_renderer(),
llm_service=config_parser.get_llm_service()
)
rendered = converter("FILEPATH")
```
### Extract blocks
Each document consists of one or more pages. Pages contain blocks, which can themselves contain other blocks. It's possible to programmatically manipulate these blocks.
Here's an example of extracting all forms from a document:
```python
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.schema import BlockTypes
converter = PdfConverter(
artifact_dict=create_model_dict(),
)
document = converter.build_document("FILEPATH")
forms = document.contained_blocks((BlockTypes.Form,))
```
Look at the processors for more examples of extracting and manipulating blocks.
## Other converters
You can also use other converters that define different conversion pipelines:
### Extract tables
The `TableConverter` will only convert and extract tables:
```python
from marker.converters.table import TableConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
converter = TableConverter(
artifact_dict=create_model_dict(),
)
rendered = converter("FILEPATH")
text, _, images = text_from_rendered(rendered)
```
This takes all the same configuration as the PdfConverter. You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table. Set `output_format=json` to also get cell bounding boxes.
You can also run this via the CLI with
```shell
marker_single FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter --output_format json
```
### OCR Only
If you only want to run OCR, you can also do that through the `OCRConverter`. Set `--keep_chars` to keep individual characters and bounding boxes.
```python
from marker.converters.ocr import OCRConverter
from marker.models import create_model_dict
converter = OCRConverter(
artifact_dict=create_model_dict(),
)
rendered = converter("FILEPATH")
```
This takes all the same configuration as the PdfConverter.
You can also run this via the CLI with
```shell
marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter
```
### Structured Extraction (beta)
You can run structured extraction via the `ExtractionConverter`. This requires an llm service to be setup first (see [here](#llm-services) for details). You'll get a JSON output with the extracted values.
```python
from marker.converters.extraction import ExtractionConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
from pydantic import BaseModel
class Links(BaseModel):
links: list[str]
schema = Links.model_json_schema()
config_parser = ConfigParser({
"page_schema": schema
})
converter = ExtractionConverter(
artifact_dict=create_model_dict(),
config=config_parser.generate_config_dict(),
llm_service=config_parser.get_llm_service(),
)
rendered = converter("FILEPATH")
```
Rendered will have an `original_markdown` field. If you pass this back in next time you run the converter, as the `existing_markdown` config key, you can skip re-parsing the document.
# Output Formats
## Markdown
Markdown output will include:
- image links (images will be saved in the same folder)
- formatted tables
- embedded LaTeX equations (fenced with `$$`)
- Code is fenced with triple backticks
- Superscripts for footnotes
## HTML
HTML output is similar to markdown output:
- Images are included via `img` tags
- equations are fenced with `<math>` tags
- code is in `pre` tags
## JSON
JSON output will be organized in a tree-like structure, with the leaf nodes being blocks. Examples of leaf nodes are a single list item, a paragraph of text, or an image.
The output will be a list, with each list item representing a page. Each page is considered a block in the internal marker schema. There are different types of blocks to represent different elements.
Pages have the keys:
- `id` - unique id for the block.
- `block_type` - the type of block. The possible block types can be seen in `marker/schema/__init__.py`. As of this writing, they are ["Line", "Span", "FigureGroup", "TableGroup", "ListGroup", "PictureGroup", "Page", "Caption", "Code", "Figure", "Footnote", "Form", "Equation", "Handwriting", "TextInlineMath", "ListItem", "PageFooter", "PageHeader", "Picture", "SectionHeader", "Table", "Text", "TableOfContents", "Document"]
- `html` - the HTML for the page. Note that this will have recursive references to children. The `content-ref` tags must be replaced with the child content if you want the full html. You can see an example of this at `marker/output.py:json_to_html`. That function will take in a single block from the json output, and turn it into HTML.
- `polygon` - the 4-corner polygon of the page, in (x1,y1), (x2,y2), (x3, y3), (x4, y4) format. (x1,y1) is the top left, and coordinates go clockwise.
- `children` - the child blocks.
The child blocks have two additional keys:
- `section_hierarchy` - indicates the sections that the block is part of. `1` indicates an h1 tag, `2` an h2, and so on.
- `images` - base64 encoded images. The key will be the block id, and the data will be the encoded image.
Note that child blocks of pages can have their own children as well (a tree structure).
```json
{
"id": "/page/10/Page/366",
"block_type": "Page",
"html": "<content-ref src='/page/10/SectionHeader/0'></content-ref><content-ref src='/page/10/SectionHeader/1'></content-ref><content-ref src='/page/10/Text/2'></content-ref><content-ref src='/page/10/Text/3'></content-ref><content-ref src='/page/10/Figure/4'></content-ref><content-ref src='/page/10/SectionHeader/5'></content-ref><content-ref src='/page/10/SectionHeader/6'></content-ref><content-ref src='/page/10/TextInlineMath/7'></content-ref><content-ref src='/page/10/TextInlineMath/8'></content-ref><content-ref src='/page/10/Table/9'></content-ref><content-ref src='/page/10/SectionHeader/10'></content-ref><content-ref src='/page/10/Text/11'></content-ref>",
"polygon": [[0.0, 0.0], [612.0, 0.0], [612.0, 792.0], [0.0, 792.0]],
"children": [
{
"id": "/page/10/SectionHeader/0",
"block_type": "SectionHeader",
"html": "<h1>Supplementary Material for <i>Subspace Adversarial Training</i> </h1>",
"polygon": [
[217.845703125, 80.630859375], [374.73046875, 80.630859375],
[374.73046875, 107.0],
[217.845703125, 107.0]
],
"children": null,
"section_hierarchy": {
"1": "/page/10/SectionHeader/1"
},
"images": {}
},
...
]
}
```
## Chunks
Chunks format is similar to JSON, but flattens everything into a single list instead of a tree. Only the top level blocks from each page show up. It also has the full HTML of each block inside, so you don't need to crawl the tree to reconstruct it. This enables flexible and easy chunking for RAG.
## Metadata
All output formats will return a metadata dictionary, with the following fields:
```json
{
"table_of_contents": [
{
"title": "Introduction",
"heading_level": 1,
"page_id": 0,
"polygon": [...]
}
], // computed PDF table of contents
"page_stats": [
{
"page_id": 0,
"text_extraction_method": "pdftext",
"block_counts": [("Span", 200), ...]
},
...
]
}
```
# LLM Services
When running with the `--use_llm` flag, you have a choice of services you can use:
- `Gemini` - this will use the Gemini developer API by default. You'll need to pass `--gemini_api_key` to configuration.
- `Google Vertex` - this will use vertex, which can be more reliable. You'll need to pass `--vertex_project_id`. To use it, set `--llm_service=marker.services.vertex.GoogleVertexService`.
- `Ollama` - this will use local models. You can configure `--ollama_base_url` and `--ollama_model`. To use it, set `--llm_service=marker.services.ollama.OllamaService`.
- `Claude` - this will use the anthropic API. You can configure `--claude_api_key`, and `--claude_model_name`. To use it, set `--llm_service=marker.services.claude.ClaudeService`.
- `OpenAI` - this supports any openai-like endpoint. You can configure `--openai_api_key`, `--openai_model`, and `--openai_base_url`. To use it, set `--llm_service=marker.services.openai.OpenAIService`.
- `Azure OpenAI` - this uses the Azure OpenAI service. You can configure `--azure_endpoint`, `--azure_api_key`, and `--deployment_name`. To use it, set `--llm_service=marker.services.azure_openai.AzureOpenAIService`.
These services may have additional optional configuration as well - you can see it by viewing the classes.
# Internals
Marker is easy to extend. The core units of marker are:
- `Providers`, at `marker/providers`. These provide information from a source file, like a PDF.
- `Builders`, at `marker/builders`. These generate the initial document blocks and fill in text, using info from the providers.
- `Processors`, at `marker/processors`. These process specific blocks, for example the table formatter is a processor.
- `Renderers`, at `marker/renderers`. These use the blocks to render output.
- `Schema`, at `marker/schema`. The classes for all the block types.
- `Converters`, at `marker/converters`. They run the whole end to end pipeline.
To customize processing behavior, override the `processors`. To add new output formats, write a new `renderer`. For additional input formats, write a new `provider.`
Processors and renderers can be directly passed into the base `PdfConverter`, so you can specify your own custom processing easily.
## API server
There is a very simple API server you can run like this:
```shell
pip install -U uvicorn fastapi python-multipart
marker_server --port 8001
```
This will start a fastapi server that you can access at `localhost:8001`. You can go to `localhost:8001/docs` to see the endpoint options.
You can send requests like this:
```
import requests
import json
post_data = {
'filepath': 'FILEPATH',
# Add other params here
}
requests.post("http://localhost:8001/marker", data=json.dumps(post_data)).json()
```
Note that this is not a very robust API, and is only intended for small-scale use. If you want to use this server, but want a more robust conversion option, you can use the hosted [Datalab API](https://www.datalab.to/plans).
# Troubleshooting
There are some settings that you may find useful if things aren't working the way you expect:
- If you have issues with accuracy, try setting `--use_llm` to use an LLM to improve quality. You must set `GOOGLE_API_KEY` to a Gemini API key for this to work.
- Make sure to set `force_ocr` if you see garbled text - this will re-OCR the document.
- `TORCH_DEVICE` - set this to force marker to use a given torch device for inference.
- If you're getting out of memory errors, decrease worker count. You can also try splitting up long PDFs into multiple files.
## Debugging
Pass the `debug` option to activate debug mode. This will save images of each page with detected layout and text, as well as output a json file with additional bounding box information.
# Benchmarks
## Overall PDF Conversion
We created a [benchmark set](https://huggingface.co/datasets/datalab-to/marker_benchmark) by extracting single PDF pages from common crawl. We scored based on a heuristic that aligns text with ground truth text segments, and an LLM as a judge scoring method.
| Method | Avg Time | Heuristic Score | LLM Score |
|------------|----------|-----------------|-----------|
| marker | 2.83837 | 95.6709 | 4.23916 |
| llamaparse | 23.348 | 84.2442 | 3.97619 |
| mathpix | 6.36223 | 86.4281 | 4.15626 |
| docling | 3.69949 | 86.7073 | 3.70429 |
Benchmarks were run on an H100 for marker and docling - llamaparse and mathpix used their cloud services. We can also look at it by document type:
<img src="data/images/per_doc.png" width="1000px"/>
| Document Type | Marker heuristic | Marker LLM | Llamaparse Heuristic | Llamaparse LLM | Mathpix Heuristic | Mathpix LLM | Docling Heuristic | Docling LLM |
|----------------------|------------------|------------|----------------------|----------------|-------------------|-------------|-------------------|-------------|
| Scientific paper | 96.6737 | 4.34899 | 87.1651 | 3.96421 | 91.2267 | 4.46861 | 92.135 | 3.72422 |
| Book page | 97.1846 | 4.16168 | 90.9532 | 4.07186 | 93.8886 | 4.35329 | 90.0556 | 3.64671 |
| Other | 95.1632 | 4.25076 | 81.1385 | 4.01835 | 79.6231 | 4.00306 | 83.8223 | 3.76147 |
| Form | 88.0147 | 3.84663 | 66.3081 | 3.68712 | 64.7512 | 3.33129 | 68.3857 | 3.40491 |
| Presentation | 95.1562 | 4.13669 | 81.2261 | 4 | 83.6737 | 3.95683 | 84.8405 | 3.86331 |
| Financial document | 95.3697 | 4.39106 | 82.5812 | 4.16111 | 81.3115 | 4.05556 | 86.3882 | 3.8 |
| Letter | 98.4021 | 4.5 | 93.4477 | 4.28125 | 96.0383 | 4.45312 | 92.0952 | 4.09375 |
| Engineering document | 93.9244 | 4.04412 | 77.4854 | 3.72059 | 80.3319 | 3.88235 | 79.6807 | 3.42647 |
| Legal document | 96.689 | 4.27759 | 86.9769 | 3.87584 | 91.601 | 4.20805 | 87.8383 | 3.65552 |
| Newspaper page | 98.8733 | 4.25806 | 84.7492 | 3.90323 | 96.9963 | 4.45161 | 92.6496 | 3.51613 |
| Magazine page | 98.2145 | 4.38776 | 87.2902 | 3.97959 | 93.5934 | 4.16327 | 93.0892 | 4.02041 |
## Throughput
We benchmarked throughput using a [single long PDF](https://www.greenteapress.com/thinkpython/thinkpython.pdf).
| Method | Time per page | Time per document | VRAM used |
|---------|---------------|-------------------|---------- |
| marker | 0.18 | 43.42 | 3.17GB |
The projected throughput is 122 pages per second on an H100 - we can run 22 individual processes given the VRAM used.
## Table Conversion
Marker can extract tables from PDFs using `marker.converters.table.TableConverter`. The table extraction performance is measured by comparing the extracted HTML representation of tables against the original HTML representations using the test split of [FinTabNet](https://developer.ibm.com/exchanges/data/all/fintabnet/). The HTML representations are compared using a tree edit distance based metric to judge both structure and content. Marker detects and identifies the structure of all tables in a PDF page and achieves these scores:
| Method | Avg score | Total tables |
|------------------|-----------|--------------|
| marker | 0.816 | 99 |
| marker w/use_llm | 0.907 | 99 |
| gemini | 0.829 | 99 |
The `--use_llm` flag can significantly improve table recognition performance, as you can see.
We filter out tables that we cannot align with the ground truth, since fintabnet and our layout model have slightly different detection methods (this results in some tables being split/merged).
## Running your own benchmarks
You can benchmark the performance of marker on your machine. Install marker manually with:
```shell
git clone https://github.com/VikParuchuri/marker.git
poetry install
```
### Overall PDF Conversion
Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. Then run the overall benchmark like this:
```shell
python benchmarks/overall.py --methods marker --scores heuristic,llm
```
Options:
- `--use_llm` use an llm to improve the marker results.
- `--max_rows` how many rows to process for the benchmark.
- `--methods` can be `llamaparse`, `mathpix`, `docling`, `marker`. Comma separated.
- `--scores` which scoring functions to use, can be `llm`, `heuristic`. Comma separated.
### Table Conversion
The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:
```shell
python benchmarks/table/table.py --max_rows 100
```
Options:
- `--use_llm` uses an llm with marker to improve accuracy.
- `--use_gemini` also benchmarks gemini 2.0 flash.
# How it works
Marker is a pipeline of deep learning models:
- Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya))
- Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya))
- Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify), [surya](https://github.com/VikParuchuri/surya))
- Optionally use an LLM to improve quality
- Combine blocks and postprocess complete text
It only uses models where necessary, which improves speed and accuracy.
# Limitations
PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:
- Very complex layouts, with nested tables and forms, may not work
- Forms may not be rendered well
Note: Passing the `--use_llm` and `--force_ocr` flags will mostly solve these issues.
# Usage and Deployment Examples
You can always run `marker` locally, but if you wanted to expose it as an API, we have a few options:
- Our platform API which is powered by `marker` and `surya` and is easy to test out - it's free to sign up, and we'll include credits, [try it out here](https://datalab.to)
- Our painless on-prem solution for commercial use, which you can [read about here](https://www.datalab.to/blog/self-serve-on-prem-licensing) and gives you privacy guarantees with high throughput inference optimizations.
- [Deployment example with Modal](./examples/README_MODAL.md) that shows you how to deploy and access `marker` through a web endpoint using [`Modal`](https://modal.com). Modal is an AI compute platform that enables developers to deploy and scale models on GPUs in minutes.
```
--------------------------------------------------------------------------------
/benchmarks/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/benchmarks/overall/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/benchmarks/overall/display/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/benchmarks/overall/download/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/benchmarks/table/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/benchmarks/throughput/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/marker/config/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/marker/scripts/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/marker/utils/__init__.py:
--------------------------------------------------------------------------------
```python
```
--------------------------------------------------------------------------------
/marker/schema/groups/base.py:
--------------------------------------------------------------------------------
```python
from marker.schema.blocks import Block
class Group(Block):
    """Base class for blocks that group related child blocks (figure, table, list, picture, and page groups)."""
    pass
```
--------------------------------------------------------------------------------
/marker/schema/text/__init__.py:
--------------------------------------------------------------------------------
```python
from marker.schema.text.line import Line
from marker.schema.text.span import Span
```
--------------------------------------------------------------------------------
/marker_server.py:
--------------------------------------------------------------------------------
```python
from marker.scripts.server import server_cli

# Thin launcher: delegate to the marker API server CLI.
if __name__ == "__main__":
    server_cli()
```
--------------------------------------------------------------------------------
/convert.py:
--------------------------------------------------------------------------------
```python
from marker.scripts.convert import convert_cli

# Thin launcher: delegate to the batch conversion CLI.
if __name__ == "__main__":
    convert_cli()
```
--------------------------------------------------------------------------------
/chunk_convert.py:
--------------------------------------------------------------------------------
```python
from marker.scripts.chunk_convert import chunk_convert_cli

# Thin launcher: delegate to the chunked conversion CLI.
if __name__ == "__main__":
    chunk_convert_cli()
```
--------------------------------------------------------------------------------
/convert_single.py:
--------------------------------------------------------------------------------
```python
from marker.scripts.convert_single import convert_single_cli

# Thin launcher: delegate to the single-file conversion CLI.
if __name__ == "__main__":
    convert_single_cli()
```
--------------------------------------------------------------------------------
/marker_app.py:
--------------------------------------------------------------------------------
```python
from marker.scripts.run_streamlit_app import streamlit_app_cli

# Thin launcher: start the main Streamlit app.
if __name__ == "__main__":
    streamlit_app_cli()
```
--------------------------------------------------------------------------------
/extraction_app.py:
--------------------------------------------------------------------------------
```python
from marker.scripts.run_streamlit_app import extraction_app_cli

# Thin launcher: start the extraction Streamlit app.
if __name__ == "__main__":
    extraction_app_cli()
```
--------------------------------------------------------------------------------
/benchmarks/overall/methods/schema.py:
--------------------------------------------------------------------------------
```python
from typing import TypedDict, List
class BenchmarkResult(TypedDict):
    """Output of one benchmark method run on one sample."""
    # Converted output: a single markdown string, or a list of markdown strings.
    markdown: str | List[str]
    # Conversion time; None when the method does not report timing.
    time: float | None
```
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
```
[pytest]
testpaths=tests
markers =
filename(name): specify the filename for the pdf_document fixture
filterwarnings =
ignore::Warning
```
--------------------------------------------------------------------------------
/benchmarks/overall/scorers/schema.py:
--------------------------------------------------------------------------------
```python
from typing import TypedDict, List, Optional, Dict
class BlockScores(TypedDict):
    """Scoring result for one method's output on one sample."""
    # Overall score for the sample.
    score: float
    # Named sub-scores; each value is a single float or a list of floats.
    specific_scores: Dict[str, float | List[float]]
```
--------------------------------------------------------------------------------
/marker/schema/blocks/toc.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks.basetable import BaseTable
class TableOfContents(BaseTable):
    """A table-of-contents block, rendered like other table blocks."""

    # Annotated as BlockTypes (was `str`) for consistency with the sibling
    # block classes (Table, Form, Char, ...); the assigned value is the same.
    block_type: BlockTypes = BlockTypes.TableOfContents
    block_description: str = "A table of contents."
```
--------------------------------------------------------------------------------
/marker/schema/text/char.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class Char(Block):
    """A single character inside a span."""

    block_type: BlockTypes = BlockTypes.Char
    block_description: str = "A single character inside a span."
    # The character's text content.
    text: str
    # Character index — presumably its position within the parent span; confirm against the OCR builder.
    idx: int
```
--------------------------------------------------------------------------------
/marker/providers/utils.py:
--------------------------------------------------------------------------------
```python
def alphanum_ratio(text):
    """Return the fraction of alphanumeric characters in ``text``.

    Spaces and newlines are stripped first, so they count toward neither the
    numerator nor the denominator. Text that is empty (or whitespace-only)
    after stripping is treated as fully alphanumeric and returns 1.
    """
    text = text.replace(" ", "").replace("\n", "")
    # Check for empty input before counting (the original counted first),
    # and sum booleans directly instead of building a throwaway list.
    if not text:
        return 1
    return sum(c.isalnum() for c in text) / len(text)
```
--------------------------------------------------------------------------------
/marker/schema/blocks/table.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks.basetable import BaseTable
class Table(BaseTable):
    """A table block containing tabular data."""

    block_type: BlockTypes = BlockTypes.Table
    block_description: str = "A table of data, like a results table. It will be in a tabular format."
```
--------------------------------------------------------------------------------
/benchmarks/overall/scorers/__init__.py:
--------------------------------------------------------------------------------
```python
from typing import List
from benchmarks.overall.scorers.schema import BlockScores
class BaseScorer:
    """Interface for benchmark scorers; subclasses implement ``__call__``."""

    def __init__(self):
        pass

    def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
        """Score ``method_markdown`` against the ground-truth markdown segments for ``sample``."""
        raise NotImplementedError()
```
--------------------------------------------------------------------------------
/tests/builders/test_structure.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.builders.structure import StructureBuilder
@pytest.mark.config({"page_range": [0]})
def test_structure_builder(pdf_document):
    """StructureBuilder should populate structure for the first page."""
    structure = StructureBuilder()
    structure(pdf_document)
    assert len(pdf_document.pages[0].structure) > 0
```
--------------------------------------------------------------------------------
/marker/schema/groups/__init__.py:
--------------------------------------------------------------------------------
```python
from marker.schema.blocks.base import Block
from marker.schema.groups.figure import FigureGroup
from marker.schema.groups.table import TableGroup
from marker.schema.groups.list import ListGroup
from marker.schema.groups.picture import PictureGroup
from marker.schema.groups.page import PageGroup
```
--------------------------------------------------------------------------------
/marker/builders/__init__.py:
--------------------------------------------------------------------------------
```python
from typing import Optional
from pydantic import BaseModel
from marker.util import assign_config
class BaseBuilder:
    """Base class for builders: applies config on init and defines the call interface."""

    def __init__(self, config: Optional[BaseModel | dict] = None):
        # Copy matching config values onto this instance.
        assign_config(self, config)

    def __call__(self, data, *args, **kwargs):
        raise NotImplementedError
```
--------------------------------------------------------------------------------
/tests/builders/test_ocr_builder.py:
--------------------------------------------------------------------------------
```python
from PIL import Image
from marker.builders.ocr import OcrBuilder
def test_blank_char_builder(recognition_model):
    """spans_from_html_chars should return no spans for an empty char list."""
    builder = OcrBuilder(recognition_model)
    image = Image.new("RGB", (100, 100))
    spans = builder.spans_from_html_chars([], None, image)  # Test with empty char list
    assert len(spans) == 0
```
--------------------------------------------------------------------------------
/marker/schema/blocks/form.py:
--------------------------------------------------------------------------------
```python
from typing import List
from marker.schema import BlockTypes
from marker.schema.blocks.basetable import BaseTable
class Form(BaseTable):
    """A form block (fields and labels), handled via the table machinery."""

    block_type: BlockTypes = BlockTypes.Form
    block_description: str = "A form, such as a tax form, that contains fields and labels. It most likely doesn't have a table structure."
```
--------------------------------------------------------------------------------
/tests/renderers/test_json_renderer.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.renderers.json import JSONRenderer
@pytest.mark.config({"page_range": [0]})
def test_json_renderer(pdf_document):
    """JSONRenderer should emit one Page child whose first block is a SectionHeader.

    Renamed from ``test_markdown_renderer_pagination``: the original name was
    copy-pasted from the markdown renderer tests, but this test exercises
    JSONRenderer and involves no pagination.
    """
    renderer = JSONRenderer()
    pages = renderer(pdf_document).children
    assert len(pages) == 1
    assert pages[0].block_type == "Page"
    assert pages[0].children[0].block_type == "SectionHeader"
```
--------------------------------------------------------------------------------
/benchmarks/overall/schema.py:
--------------------------------------------------------------------------------
```python
from typing import TypedDict, List, Dict
from benchmarks.overall.scorers.schema import BlockScores
# Three-level nested mapping of score lists, keyed by strings
# (presumably method / document-or-block type / score name — confirm against the benchmark code).
AVG_TYPE = Dict[str, Dict[str, Dict[str, List[float]]]]


class FullResult(TypedDict):
    """Aggregated results for a full benchmark run."""
    # Per-sample scores, keyed by sample index.
    scores: Dict[int, Dict[str, Dict[str, BlockScores]]]
    averages_by_type: AVG_TYPE
    averages_by_block_type: AVG_TYPE
    # Per-method lists of timings.
    average_times: Dict[str, List[float]]
    # Per-sample markdown output, keyed by sample index.
    markdown: Dict[int, Dict[str, str]]
```
--------------------------------------------------------------------------------
/tests/builders/test_strip_existing_ocr.py:
--------------------------------------------------------------------------------
```python
import pytest
@pytest.mark.config({"page_range": [0], "strip_existing_ocr": True})
@pytest.mark.filename("handwritten.pdf")
def test_strip_ocr(doc_provider):
    """With strip_existing_ocr set, embedded OCR text must not be extracted."""
    # Ensure that the OCR text isn't extracted
    assert len(doc_provider.page_lines) == 0


@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("handwritten.pdf")
def test_keep_ocr(doc_provider):
    """Without stripping, the provider keeps the page's existing OCR lines."""
    assert len(doc_provider.page_lines) == 1
```
--------------------------------------------------------------------------------
/tests/renderers/test_html_renderer.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.renderers.html import HTMLRenderer
@pytest.mark.config(
    {
        "page_range": [0],
        "disable_ocr": True,
        "add_block_ids": True,
        "paginate_output": True,
    }
)
def test_html_renderer_block_ids(pdf_document, config):
    """With add_block_ids enabled, rendered HTML should contain block id paths."""
    renderer = HTMLRenderer(config)
    html = renderer(pdf_document).html
    # Verify some block IDs are present
    assert "/page/0/Text/1" in html
--------------------------------------------------------------------------------
/tests/processors/test_document_toc_processor.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.processors.document_toc import DocumentTOCProcessor
@pytest.mark.config({"page_range": [0]})
def test_document_toc_processor(pdf_document, detection_model, recognition_model, table_rec_model):
    """DocumentTOCProcessor should build a 4-entry TOC for the test document.

    NOTE(review): the model fixtures are not used in the body — presumably
    kept for fixture side effects; confirm before removing.
    """
    processor = DocumentTOCProcessor()
    processor(pdf_document)
    assert len(pdf_document.table_of_contents) == 4
    assert pdf_document.table_of_contents[0]["title"] == "Subspace Adversarial Training"
```
--------------------------------------------------------------------------------
/tests/processors/test_equation_processor.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.schema import BlockTypes
from marker.processors.equation import EquationProcessor
@pytest.mark.config({"page_range": [0]})
def test_equation_processor(pdf_document, recognition_model):
    """After processing, every Equation block on page 0 should have html set."""
    processor = EquationProcessor(recognition_model)
    processor(pdf_document)
    for block in pdf_document.pages[0].children:
        if block.block_type == BlockTypes.Equation:
            assert block.html is not None
```
--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
```python
from marker.providers.pdf import PdfProvider
import tempfile
import datasets
def setup_pdf_provider(
    filename='adversarial.pdf',
    config=None,
) -> PdfProvider:
    """Build a PdfProvider for a named PDF from the datalab-to/pdfs dataset.

    Downloads the dataset, writes the requested PDF to a temporary file, and
    returns a provider pointed at it.

    NOTE(review): no reference to the NamedTemporaryFile is kept, so the file
    is deleted once this function returns — presumably PdfProvider reads the
    file eagerly in its constructor; confirm.
    """
    ds = datasets.load_dataset("datalab-to/pdfs", split="train")
    row_idx = ds['filename'].index(filename)
    pdf_file = tempfile.NamedTemporaryFile(suffix=".pdf")
    pdf_file.write(ds['pdf'][row_idx])
    pdf_file.flush()
    return PdfProvider(pdf_file.name, config)
```
--------------------------------------------------------------------------------
/marker/schema/blocks/code.py:
--------------------------------------------------------------------------------
```python
import html
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class Code(Block):
    """A programming code block."""

    block_type: BlockTypes = BlockTypes.Code
    # Raw code text, if extracted.
    code: str | None = None
    # Pre-rendered HTML override, if set.
    html: str | None = None
    block_description: str = "A programming code block."

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        """Return pre-rendered HTML if present; otherwise escape the raw code inside a <pre> tag."""
        if self.html:
            return self.html
        code = self.code or ""
        return f"<pre>{html.escape(code)}</pre>"
```
--------------------------------------------------------------------------------
/marker/processors/__init__.py:
--------------------------------------------------------------------------------
```python
from typing import Optional, Tuple
from pydantic import BaseModel
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.util import assign_config
class BaseProcessor:
    """Base class for processors: applies config on init and defines the call interface."""

    block_types: Tuple[BlockTypes] | None = None  # What block types this processor is responsible for

    def __init__(self, config: Optional[BaseModel | dict] = None):
        # Copy matching config values onto this instance.
        assign_config(self, config)

    def __call__(self, document: Document, *args, **kwargs):
        raise NotImplementedError
```
--------------------------------------------------------------------------------
/tests/processors/test_footnote_processor.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.processors.footnote import FootnoteProcessor
from marker.schema import BlockTypes
@pytest.mark.filename("population_stats.pdf")
@pytest.mark.config({"page_range": [4]})
def test_footnote_processor(pdf_document):
    """FootnoteProcessor should find footnotes on the page, with the last starting with '5'."""
    processor = FootnoteProcessor()
    processor(pdf_document)
    page0_footnotes = pdf_document.pages[0].contained_blocks(pdf_document, [BlockTypes.Footnote])
    assert len(page0_footnotes) >= 2
    assert page0_footnotes[-1].raw_text(pdf_document).strip().startswith("5")
```
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
```markdown
---
name: Feature request
about: Suggest an idea for this project
title: "[FEAT]"
labels: enhancement
assignees: ''
---
## ✨ Is your feature request related to a problem?
A clear and concise description of what the problem is.
## 💡 Describe the Solution You'd Like
A concise description of what you want to happen or how you envision it working.
## 📋 Alternatives Considered
Any alternative solutions or workarounds you've tried.
## 🧩 Additional Context
Any additional context, references, or related issues.
```
--------------------------------------------------------------------------------
/marker/schema/blocks/reference.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class Reference(Block):
    """An anchor block that other blocks can point to via its ``ref`` id."""

    block_type: BlockTypes = BlockTypes.Reference
    # Anchor id emitted on the wrapping span.
    ref: str
    block_description: str = "A reference to this block from another block."

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        """Wrap the normally-assembled HTML in a span carrying the anchor id."""
        template = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
        return f"<span id='{self.ref}'>{template}</span>"
```
--------------------------------------------------------------------------------
/marker/utils/batch.py:
--------------------------------------------------------------------------------
```python
from marker.utils.gpu import GPUManager
def get_batch_sizes_worker_counts(gpu_manager: GPUManager, peak_worker_vram: int):
    """Derive (batch-size overrides, worker count) from available GPU VRAM.

    The worker count is total VRAM divided by per-worker peak VRAM, floored,
    with a minimum of one. A single worker keeps the default batch sizes
    (empty override dict); multiple workers get the fixed overrides below.
    """
    total_vram = gpu_manager.get_gpu_vram()
    worker_count = max(1, total_vram // peak_worker_vram)
    if worker_count == 1:
        # Not enough VRAM for parallel workers — keep defaults.
        return {}, worker_count
    batch_sizes = {
        "layout_batch_size": 12,
        "detection_batch_size": 8,
        "table_rec_batch_size": 12,
        "ocr_error_batch_size": 12,
        "recognition_batch_size": 64,
        "equation_batch_size": 16,
        "detector_postprocessing_cpu_workers": 2,
    }
    return batch_sizes, worker_count
```
--------------------------------------------------------------------------------
/tests/providers/test_pdf_provider.py:
--------------------------------------------------------------------------------
```python
import pytest
@pytest.mark.config({"page_range": [0]})
def test_pdf_provider(doc_provider):
    """Check page count, rendered image sizes at 72/96 DPI, and span extraction for page 0."""
    assert len(doc_provider) == 12
    assert doc_provider.get_images([0], 72)[0].size == (612, 792)
    assert doc_provider.get_images([0], 96)[0].size == (816, 1056)
    page_lines = doc_provider.get_page_lines(0)
    assert len(page_lines) == 85
    spans = page_lines[0].spans
    assert len(spans) == 2
    assert spans[0].text == "Subspace Adversarial Training"
    assert spans[0].font == "NimbusRomNo9L-Medi"
    assert spans[0].formats == ["plain"]
```
--------------------------------------------------------------------------------
/tests/providers/test_image_provider.py:
--------------------------------------------------------------------------------
```python
from marker.providers.image import ImageProvider
from marker.renderers.markdown import MarkdownOutput
def test_image_provider(config, temp_image):
    """ImageProvider should expose one page, the image size, and no text lines."""
    provider = ImageProvider(temp_image.name, config)
    assert len(provider) == 1
    assert provider.get_images([0], 72)[0].size == (512, 512)
    page_lines = provider.get_page_lines(0)
    assert len(page_lines) == 0


def test_image_provider_conversion(pdf_converter, temp_image):
    """End-to-end conversion of an image should OCR its text into the markdown output."""
    markdown_output: MarkdownOutput = pdf_converter(temp_image.name)
    assert "Hello, World!" in markdown_output.markdown
```
--------------------------------------------------------------------------------
/marker/schema/groups/picture.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.groups.base import Group
class PictureGroup(Group):
    """A picture along with its associated captions."""

    block_type: BlockTypes = BlockTypes.PictureGroup
    block_description: str = "A picture along with associated captions."
    # Pre-rendered HTML override; when set, it wins over child assembly.
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Return the HTML override if present, else assemble from children."""
        if self.html:
            return self.html
        child_html = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
        return child_html
```
--------------------------------------------------------------------------------
/benchmarks/overall/methods/mathpix.py:
--------------------------------------------------------------------------------
```python
import datasets
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
class MathpixMethod(BaseMethod):
    """Benchmark method that reads precomputed Mathpix results from a dataset."""

    mathpix_ds: datasets.Dataset = None

    def __call__(self, sample) -> BenchmarkResult:
        """Return the stored markdown and timing for this sample's uuid."""
        uuid = sample["uuid"]
        target = str(uuid)
        # Linear scan of the dataset, matching on stringified uuid.
        data = next(
            (row for row in self.mathpix_ds if str(row["uuid"]) == target), None
        )
        if not data:
            raise ValueError(f"Could not find data for uuid {uuid}")
        return {"markdown": data["md"], "time": data["time"]}
```
--------------------------------------------------------------------------------
/benchmarks/overall/methods/mistral.py:
--------------------------------------------------------------------------------
```python
import datasets
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
class MistralMethod(BaseMethod):
    """Benchmark method that reads precomputed Mistral results from a dataset."""

    mistral_ds: datasets.Dataset = None

    def __call__(self, sample) -> BenchmarkResult:
        """Return the stored markdown and timing for this sample's uuid."""
        uuid = sample["uuid"]
        target = str(uuid)
        # Linear scan of the dataset, matching on stringified uuid.
        data = next(
            (row for row in self.mistral_ds if str(row["uuid"]) == target), None
        )
        if not data:
            raise ValueError(f"Could not find data for uuid {uuid}")
        return {"markdown": data["md"], "time": data["time"]}
```
--------------------------------------------------------------------------------
/marker/scripts/run_streamlit_app.py:
--------------------------------------------------------------------------------
```python
import subprocess
import os
import sys
def streamlit_app_cli(app_name: str = "streamlit_app.py"):
    """Launch the named Streamlit app, forwarding CLI args after ``--``."""
    extra_args = sys.argv[1:]
    script_dir = os.path.dirname(os.path.abspath(__file__))
    cmd = [
        "streamlit",
        "run",
        os.path.join(script_dir, app_name),
        "--server.fileWatcherType",
        "none",
        "--server.headless",
        "true",
    ]
    if extra_args:
        cmd.extend(["--"] + extra_args)
    # IN_STREAMLIT lets the app detect it is running under this launcher.
    subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"})


def extraction_app_cli():
    """Launch the extraction variant of the Streamlit app."""
    streamlit_app_cli("extraction_app.py")
```
--------------------------------------------------------------------------------
/marker/schema/groups/figure.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.groups.base import Group
class FigureGroup(Group):
    """A figure together with its associated captions."""

    block_type: BlockTypes = BlockTypes.FigureGroup
    block_description: str = "A group that contains a figure and associated captions."
    # Pre-rendered HTML override; when set, it wins over child assembly.
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Return the HTML override if present, else assemble from children."""
        if self.html:
            return self.html
        child_html = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
        return child_html
```
--------------------------------------------------------------------------------
/benchmarks/overall/methods/llamaparse.py:
--------------------------------------------------------------------------------
```python
import datasets
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
class LlamaParseMethod(BaseMethod):
    """Benchmark method that reads precomputed LlamaParse results from a dataset."""

    llamaparse_ds: datasets.Dataset = None

    def __call__(self, sample) -> BenchmarkResult:
        """Return the stored markdown and timing for this sample's uuid."""
        uuid = sample["uuid"]
        target = str(uuid)
        # Linear scan of the dataset, matching on stringified uuid.
        data = next(
            (row for row in self.llamaparse_ds if str(row["uuid"]) == target), None
        )
        if not data:
            raise ValueError(f"Could not find data for uuid {uuid}")
        return {"markdown": data["md"], "time": data["time"]}
```
--------------------------------------------------------------------------------
/marker/schema/blocks/handwriting.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class Handwriting(Block):
    """A region that contains handwriting."""

    block_type: BlockTypes = BlockTypes.Handwriting
    block_description: str = "A region that contains handwriting."
    # Pre-rendered HTML override; when set, it wins over normal assembly.
    html: str | None = None
    replace_output_newlines: bool = True

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Return the HTML override if present, else fall back to default assembly."""
        if self.html:
            return self.html
        else:
            return super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
```
--------------------------------------------------------------------------------
/tests/schema/groups/test_list_grouping.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.builders.structure import StructureBuilder
from marker.schema import BlockTypes
@pytest.mark.config({"page_range": [4]})
@pytest.mark.skip(reason="Model breaks this up due to equations")
def test_list_grouping(pdf_document):
    """List items on the page should be grouped into ListGroup blocks."""
    StructureBuilder()(pdf_document)

    page = pdf_document.pages[0]
    list_groups = [
        child
        for child in page.children
        if child.block_type == BlockTypes.ListGroup
    ]
    # The model breaks this up, since it has equations in it
    assert len(list_groups) == 3
--------------------------------------------------------------------------------
/tests/renderers/test_extract_images.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.renderers.markdown import MarkdownRenderer
@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("A17_FlightPlan.pdf")
def test_disable_extract_images(pdf_document):
    """With extract_images disabled, no image references should be emitted."""
    output = MarkdownRenderer({"extract_images": False})(pdf_document)
    # Verify markdown
    assert "jpeg" not in output.markdown
@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("A17_FlightPlan.pdf")
def test_extract_images(pdf_document):
    """With default settings, extracted images should appear in the markdown."""
    output = MarkdownRenderer()(pdf_document)
    # Verify markdown
    assert "jpeg" in output.markdown
```
--------------------------------------------------------------------------------
/marker/renderers/extraction.py:
--------------------------------------------------------------------------------
```python
from pydantic import BaseModel
from marker.extractors.document import DocumentExtractionSchema
from marker.renderers import BaseRenderer
class ExtractionOutput(BaseModel):
    # Rendered result of a structured-extraction run: the model's free-form
    # analysis, the extracted document serialized as a JSON string, and the
    # markdown the extraction was derived from.
    analysis: str
    document_json: str
    original_markdown: str
class ExtractionRenderer(BaseRenderer):
    """Wraps a document-extraction result into an ExtractionOutput."""

    def __call__(
        self, output: DocumentExtractionSchema, markdown: str
    ) -> ExtractionOutput:
        # We definitely want to do more complex stuff here soon, so leave it in
        rendered = ExtractionOutput(
            analysis=output.analysis,
            document_json=output.document_json,
            original_markdown=markdown,
        )
        return rendered
```
--------------------------------------------------------------------------------
/marker/scripts/chunk_convert.py:
--------------------------------------------------------------------------------
```python
import argparse
import os
import subprocess
import pkg_resources
def chunk_convert_cli():
    """CLI entry point: convert a folder of PDFs to markdown in parallel chunks.

    Delegates the actual work to the bundled chunk_convert.sh shell script.

    Raises:
        subprocess.CalledProcessError: if the shell script exits non-zero.
    """
    parser = argparse.ArgumentParser(
        description="Convert a folder of PDFs to a folder of markdown files in chunks."
    )
    parser.add_argument("in_folder", help="Input folder with pdfs.")
    parser.add_argument("out_folder", help="Output folder")
    args = parser.parse_args()

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    script_path = os.path.join(cur_dir, "chunk_convert.sh")

    # Pass arguments as a list (shell=False): paths containing spaces or shell
    # metacharacters are forwarded verbatim instead of being re-parsed (and
    # potentially injected) by a shell.
    subprocess.run([script_path, args.in_folder, args.out_folder], check=True)
```
--------------------------------------------------------------------------------
/tests/processors/test_ignoretext.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.processors.ignoretext import IgnoreTextProcessor
from marker.schema import BlockTypes
@pytest.mark.filename("bio_pdf.pdf")
@pytest.mark.config({"page_range": list(range(10))})
@pytest.mark.skip(reason="New layout model correctly identifies the block as a PageHeader, so nothing to be done by the IgnoreTextProcessor")
def test_ignoretext_processor(pdf_document):
    """Repeated header text should be flagged so it is excluded from output."""
    IgnoreTextProcessor()(pdf_document)

    header = pdf_document.pages[1].contained_blocks(pdf_document, [BlockTypes.Text])[0]
    assert "bioRxiv" in header.raw_text(pdf_document)
    assert header.ignore_for_output is True
```
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
```yaml
name: CI tests
on: [push]
jobs:
tests:
runs-on: t4_gpu
steps:
- uses: actions/checkout@v3
- name: Install apt requirements
run: |
sudo apt-get update
sudo apt-get install -y libpango-1.0-0 libharfbuzz0b libpangoft2-1.0-0 libgdk-pixbuf2.0-0 libcairo2 libffi-dev shared-mime-info
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install python dependencies
run: |
pip install poetry
poetry install --extras "full"
- name: Run tests
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: poetry run pytest
```
--------------------------------------------------------------------------------
/marker/schema/blocks/footnote.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class Footnote(Block):
    """A footnote explaining a term or concept in the document."""

    block_type: BlockTypes = BlockTypes.Footnote
    block_description: str = (
        "A footnote that explains a term or concept in the document."
    )
    replace_output_newlines: bool = True
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        # Without pre-rendered HTML, fall back to the default block assembly.
        if not self.html:
            return super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
        return super().handle_html_output(
            document, child_blocks, parent_structure, block_config
        )
```
--------------------------------------------------------------------------------
/benchmarks/overall/methods/gt.py:
--------------------------------------------------------------------------------
```python
from typing import List
import json
from PIL import Image
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
class GTMethod(BaseMethod):
    """Ground-truth 'method': converts stored GT HTML blocks into markdown."""

    def __call__(self, sample) -> BenchmarkResult:
        blocks = json.loads(sample["gt_blocks"])
        # Drop empty HTML fragments before conversion.
        html_fragments = [b["html"] for b in blocks if len(b["html"]) > 0]
        return {
            "markdown": [self.convert_to_md(fragment) for fragment in html_fragments],
            "time": 0,
        }

    def render(self, html: List[str]) -> Image.Image:
        """Join the HTML fragments into one page and rasterize it."""
        joined = "\n\n".join(html)
        page = f"""
<html>
<head></head>
<body>
{joined}
</body>
</html>
""".strip()
        return self.html_to_image(page)
```
--------------------------------------------------------------------------------
/marker/schema/blocks/caption.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class Caption(Block):
block_type: BlockTypes = BlockTypes.Caption
block_description: str = "A text caption that is directly above or below an image or table. Only used for text describing the image or table. "
replace_output_newlines: bool = True
html: str | None = None
def assemble_html(self, document, child_blocks, parent_structure, block_config):
if self.html:
return super().handle_html_output(
document, child_blocks, parent_structure, block_config
)
return super().assemble_html(
document, child_blocks, parent_structure, block_config
)
```
--------------------------------------------------------------------------------
/marker/schema/__init__.py:
--------------------------------------------------------------------------------
```python
from enum import auto, Enum
class BlockTypes(str, Enum):
    """Every block type marker can produce in a parsed document.

    The ``str`` mixin makes members string-like for serialization; ``__str__``
    returns the member name so rendered output uses readable identifiers.
    """

    # Low-level text primitives
    Line = auto()
    Span = auto()
    Char = auto()
    # Grouping containers
    FigureGroup = auto()
    TableGroup = auto()
    ListGroup = auto()
    PictureGroup = auto()
    # Page container
    Page = auto()
    # Content blocks
    Caption = auto()
    Code = auto()
    Figure = auto()
    Footnote = auto()
    Form = auto()
    Equation = auto()
    Handwriting = auto()
    TextInlineMath = auto()
    ListItem = auto()
    PageFooter = auto()
    PageHeader = auto()
    Picture = auto()
    SectionHeader = auto()
    Table = auto()
    Text = auto()
    TableOfContents = auto()
    # Document-level / structural blocks
    Document = auto()
    ComplexRegion = auto()
    TableCell = auto()
    Reference = auto()

    def __str__(self):
        # Present the member name rather than its auto-generated value.
        return self.name
```
--------------------------------------------------------------------------------
/benchmarks/overall/methods/docling.py:
--------------------------------------------------------------------------------
```python
import tempfile
import time
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
class DoclingMethod(BaseMethod):
    """Benchmark method that converts a single-page PDF with Docling."""

    model_dict: dict = None
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        from docling.document_converter import DocumentConverter

        pdf_bytes = sample["pdf"]  # This is a single page PDF
        converter = DocumentConverter()

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            # Flush before handing the path to Docling: it reopens the file by
            # name, and without a flush the tail of the PDF can still sit in
            # the Python-level buffer, yielding a truncated/corrupt document.
            f.flush()
            start = time.time()
            result = converter.convert(f.name)
            total = time.time() - start

        return {
            "markdown": result.document.export_to_markdown(),
            "time": total,
        }
```
--------------------------------------------------------------------------------
/marker/schema/blocks/pagefooter.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class PageFooter(Block):
    """Text at the bottom of a page (e.g. a page number); ignored by default."""

    # Annotated as BlockTypes (was `str`) for consistency with every other
    # block class (PageHeader, Caption, Footnote, ...).
    block_type: BlockTypes = BlockTypes.PageFooter
    block_description: str = (
        "Text that appears at the bottom of a page, like a page number."
    )
    replace_output_newlines: bool = True
    ignore_for_output: bool = True
    html: str | None = None

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        # Config can opt footers back into the rendered output.
        if block_config and block_config.get("keep_pagefooter_in_output"):
            self.ignore_for_output = False
        if self.html and not self.ignore_for_output:
            return self.html
        return super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
```
--------------------------------------------------------------------------------
/marker/schema/blocks/pageheader.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class PageHeader(Block):
    """Text at the top of a page (e.g. a running title); ignored by default."""

    block_type: BlockTypes = BlockTypes.PageHeader
    block_description: str = (
        "Text that appears at the top of a page, like a page title."
    )
    replace_output_newlines: bool = True
    ignore_for_output: bool = True
    html: str | None = None

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        # Config can opt headers back into the rendered output.
        keep_header = block_config and block_config.get("keep_pageheader_in_output")
        if keep_header:
            self.ignore_for_output = False
        if self.html and not self.ignore_for_output:
            return self.html
        return super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
```
--------------------------------------------------------------------------------
/marker/schema/groups/table.py:
--------------------------------------------------------------------------------
```python
from typing import List
from marker.schema import BlockTypes
from marker.schema.blocks import BlockOutput
from marker.schema.groups.base import Group
class TableGroup(Group):
    """A table grouped together with its associated captions."""

    block_type: BlockTypes = BlockTypes.TableGroup
    block_description: str = "A table along with associated captions."
    html: str | None = None

    def assemble_html(
        self,
        document,
        child_blocks: List[BlockOutput],
        parent_structure=None,
        block_config: dict | None = None,
    ):
        # No pre-rendered HTML: use the default group assembly.
        if not self.html:
            return super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
        return self.handle_html_output(
            document, child_blocks, parent_structure, block_config
        )
```
--------------------------------------------------------------------------------
/tests/builders/test_rotated_bboxes.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.schema import BlockTypes
@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("adversarial_rot.pdf")
def test_rotated_bboxes(pdf_document):
    """Rotated line bboxes must stay within the extent of their layout blocks."""
    first_page = pdf_document.pages[0]

    # Ensure we match all text lines up properly
    lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
    blocks = first_page.contained_blocks(
        pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
    )
    # assert len(text_lines) == 84

    # Ensure the bbox sizes match up
    rightmost_line = max(line.polygon.x_end for line in lines)
    rightmost_block = max(
        block.polygon.x_end for block in blocks if block.source == "layout"
    )
    assert rightmost_line <= rightmost_block
```
--------------------------------------------------------------------------------
/marker/processors/document_toc.py:
--------------------------------------------------------------------------------
```python
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
class DocumentTOCProcessor(BaseProcessor):
    """
    A processor for generating a table of contents for the document.
    """

    block_types = (BlockTypes.SectionHeader, )

    def __call__(self, document: Document):
        # One TOC entry per section header, in page order.
        document.table_of_contents = [
            {
                "title": header.raw_text(document).strip(),
                "heading_level": header.heading_level,
                "page_id": page.page_id,
                "polygon": header.polygon.polygon,
            }
            for page in document.pages
            for header in page.contained_blocks(document, self.block_types)
        ]
```
--------------------------------------------------------------------------------
/benchmarks/overall/registry.py:
--------------------------------------------------------------------------------
```python
from benchmarks.overall.methods.docling import DoclingMethod
from benchmarks.overall.methods.gt import GTMethod
from benchmarks.overall.methods.llamaparse import LlamaParseMethod
from benchmarks.overall.methods.marker import MarkerMethod
from benchmarks.overall.methods.mathpix import MathpixMethod
from benchmarks.overall.methods.mistral import MistralMethod
from benchmarks.overall.methods.olmocr import OlmOCRMethod
from benchmarks.overall.scorers.heuristic import HeuristicScorer
from benchmarks.overall.scorers.llm import LLMScorer
# Maps CLI scorer names to their scorer implementations.
SCORE_REGISTRY = {
    "heuristic": HeuristicScorer,
    "llm": LLMScorer
}

# Maps CLI method names to their conversion-method implementations.
METHOD_REGISTRY = {
    "marker": MarkerMethod,
    "gt": GTMethod,
    "mathpix": MathpixMethod,
    "llamaparse": LlamaParseMethod,
    "docling": DoclingMethod,
    "olmocr": OlmOCRMethod,
    "mistral": MistralMethod
}
```
--------------------------------------------------------------------------------
/marker/processors/page_header.py:
--------------------------------------------------------------------------------
```python
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup
class PageHeaderProcessor(BaseProcessor):
    """
    A processor for moving PageHeaders to the top
    """

    block_types = (BlockTypes.PageHeader,)

    def __call__(self, document: Document):
        for page in document.pages:
            self.move_page_header_to_top(page, document)

    def move_page_header_to_top(self, page: PageGroup, document: Document):
        # Pull every header id out of the structure, then re-insert the whole
        # batch at the front, preserving their relative order.
        header_ids = [
            block.id for block in page.contained_blocks(document, self.block_types)
        ]
        for header_id in header_ids:
            page.structure.remove(header_id)
        page.structure[:0] = header_ids
```
--------------------------------------------------------------------------------
/marker/schema/groups/list.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.groups.base import Group
class ListGroup(Group):
    """A group of list items rendered together as a single list."""

    block_type: BlockTypes = BlockTypes.ListGroup
    has_continuation: bool = False
    block_description: str = "A group of list items that should be rendered together."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.html:
            return self.handle_html_output(
                document, child_blocks, parent_structure, block_config
            )
        inner = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
        # Mark continuation lists so downstream renderers can stitch them.
        attrs = f" block-type='{self.block_type}'"
        if self.has_continuation:
            attrs = attrs + " class='has-continuation'"
        return f"<p{attrs}><ul>{inner}</ul></p>"
```
--------------------------------------------------------------------------------
/benchmarks/overall/download/main.py:
--------------------------------------------------------------------------------
```python
import click
from benchmarks.overall.download.llamaparse import LlamaParseDownloader
from benchmarks.overall.download.mathpix import MathpixDownloader
from benchmarks.overall.download.mistral import MistralDownloader
@click.command("Download data from inference services")
@click.argument("service", type=click.Choice(["mathpix", "llamaparse", "mistral"]))
@click.option("--max_rows", type=int, default=2200)
@click.option("--api_key", type=str, default=None)
@click.option("--app_id", type=str, default=None)
def main(service: str, max_rows: int, api_key: str, app_id: str):
    """Download benchmark results from the chosen inference service.

    NOTE(review): all downloaders receive both api_key and app_id positionally;
    presumably app_id is only used by services that need one — confirm against
    the downloader constructors.
    """
    registry = {
        "mathpix": MathpixDownloader,
        "llamaparse": LlamaParseDownloader,
        "mistral": MistralDownloader,
    }
    downloader = registry[service](api_key, app_id, max_rows=max_rows)
    # Generate data and upload to hub
    downloader()


if __name__ == "__main__":
    main()
```
--------------------------------------------------------------------------------
/marker/logger.py:
--------------------------------------------------------------------------------
```python
import logging
import warnings
from marker.settings import settings
def configure_logging():
    """Attach a stream handler to the marker logger and quiet noisy deps."""
    # Setup marker logger
    logger = get_logger()

    if not logger.handlers:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(
            logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")
        )
        logger.addHandler(stream_handler)
    logger.setLevel(settings.LOGLEVEL)

    # Ignore future warnings
    warnings.simplefilter(action="ignore", category=FutureWarning)

    # Set component loglevels
    for noisy in ("PIL", "fontTools.subset", "fontTools.ttLib.ttFont"):
        logging.getLogger(noisy).setLevel(logging.ERROR)
    logging.getLogger("weasyprint").setLevel(logging.CRITICAL)
def get_logger():
    """Return the shared package-level "marker" logger."""
    logger = logging.getLogger("marker")
    return logger
```
--------------------------------------------------------------------------------
/tests/processors/test_table_merge.py:
--------------------------------------------------------------------------------
```python
from unittest.mock import Mock
import pytest
from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
from marker.processors.table import TableProcessor
from marker.schema import BlockTypes
@pytest.mark.filename("table_ex2.pdf")
def test_llm_table_processor_nomerge(pdf_document, table_rec_model, recognition_model, detection_model, mocker):
    """Tables that should not merge must remain separate after the LLM pass."""
    mock_cls = Mock()
    mock_cls.return_value = {"merge": "true", "direction": "right"}

    TableProcessor(recognition_model, table_rec_model, detection_model)(pdf_document)
    assert len(pdf_document.contained_blocks((BlockTypes.Table,))) == 3

    merge_processor = LLMTableMergeProcessor(
        mock_cls, {"use_llm": True, "gemini_api_key": "test"}
    )
    merge_processor(pdf_document)
    assert len(pdf_document.contained_blocks((BlockTypes.Table,))) == 3
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
```yaml
name: Python package
on:
push:
tags:
- "v*.*.*"
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install python dependencies
run: |
pip install poetry
poetry install --extras "full"
- name: Build package
run: |
poetry build
- name: Extract version from pyproject.toml
id: version
run: |
VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['tool']['poetry']['version'])")
echo "version=v$VERSION" >> $GITHUB_OUTPUT
- name: Publish package
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: |
poetry config pypi-token.pypi "$PYPI_TOKEN"
poetry publish
```
--------------------------------------------------------------------------------
/marker/schema/blocks/sectionheader.py:
--------------------------------------------------------------------------------
```python
from typing import Optional
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class SectionHeader(Block):
    """The header of a section of text or other blocks."""

    block_type: BlockTypes = BlockTypes.SectionHeader
    heading_level: Optional[int] = None
    block_description: str = "The header of a section of text or other blocks."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.ignore_for_output:
            return ""
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )
        inner = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        ).replace("\n", " ")
        # Default to h2 when no heading level was detected.
        level = self.heading_level if self.heading_level else 2
        return f"<h{level}>{inner}</h{level}>"
```
--------------------------------------------------------------------------------
/tests/renderers/test_chunk_renderer.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.renderers.chunk import ChunkRenderer
@pytest.mark.config({"page_range": [0]})
def test_chunk_renderer(pdf_document):
    """Chunk output should fold figures and captions into one FigureGroup."""
    chunk_output = ChunkRenderer()(pdf_document)
    blocks = chunk_output.blocks
    page_info = chunk_output.page_info

    assert len(blocks) == 14
    assert blocks[0].block_type == "SectionHeader"
    assert page_info[0]["bbox"] is not None
    assert page_info[0]["polygon"] is not None

    figure_groups = [b for b in blocks if b.block_type == "FigureGroup"]
    bare_figures = [b for b in blocks if b.block_type == "Figure"]
    bare_captions = [b for b in blocks if b.block_type == "Caption"]
    assert len(figure_groups) == 1
    assert len(bare_figures) == 0
    assert len(bare_captions) == 0

    group = figure_groups[0]
    assert group.images is not None
    assert len(group.images) == 1
    assert "<img src='/page/0/Figure/9'>" in group.html
--------------------------------------------------------------------------------
/marker/providers/html.py:
--------------------------------------------------------------------------------
```python
import os
import tempfile
from marker.providers.pdf import PdfProvider
class HTMLProvider(PdfProvider):
    """Provider that renders an HTML file to a temporary PDF, then delegates
    everything else to PdfProvider.
    """

    def __init__(self, filepath: str, config=None):
        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        self.temp_pdf_path = temp_pdf.name
        temp_pdf.close()

        # Convert HTML to PDF
        try:
            self.convert_html_to_pdf(filepath)
        except Exception as e:
            # Chain the original exception so the root cause stays visible.
            raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}") from e

        # Initialize the PDF provider with the temp pdf path
        super().__init__(self.temp_pdf_path, config)

    def __del__(self):
        # Guard with getattr: __init__ may have failed before temp_pdf_path
        # was assigned, and __del__ must never raise for that.
        temp_path = getattr(self, "temp_pdf_path", None)
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)

    def convert_html_to_pdf(self, filepath: str):
        from weasyprint import HTML

        font_css = self.get_font_css()
        HTML(filename=filepath, encoding="utf-8").write_pdf(
            self.temp_pdf_path, stylesheets=[font_css]
        )
--------------------------------------------------------------------------------
/data/latex_to_md.sh:
--------------------------------------------------------------------------------
```bash
#!/bin/bash
# Convert every .tex file under latex/ into a markdown reference file under
# references/, then scrub pandoc artifacts (non-breaking spaces, cite
# commands, content markers, brackets) with sed.

# List all .tex files in the latex folder
FILES=$(find latex -name "*.tex")
for f in $FILES
do
  echo "Processing $f file..."
  base_name=$(basename "$f" .tex)
  out_file="references/${base_name}.md"
  pandoc --wrap=none \
    --no-highlight \
    --strip-comments \
    --from=latex \
    --to=commonmark_x+pipe_tables \
    "$f" \
    -o "$out_file"
  # Replace non-breaking spaces
  # NOTE(review): `sed -i .bak` (with a space) is BSD/macOS-only syntax; the
  # later `sed -i.bak` form is portable. On GNU sed the spaced form treats
  # ".bak" as the script and fails — confirm the intended target platform.
  sed -i .bak 's/ / /g' "$out_file"
  sed -i .bak 's/ / /g' "$out_file"
  sed -i .bak 's/ / /g' "$out_file"
  sed -i .bak 's/ / /g' "$out_file"
  sed -i.bak -E 's/`\\cite`//g; s/<[^>]*>//g; s/\{[^}]*\}//g; s/\\cite\{[^}]*\}//g' "$out_file"
  sed -i.bak -E '
  s/`\\cite`//g;      # Remove \cite commands inside backticks
  s/::: //g;          # Remove the leading ::: for content markers
  s/\[//g;            # Remove opening square bracket
  s/\]//g;            # Remove closing square bracket
  ' "$out_file"
  # Remove .bak file
  rm "$out_file.bak"
done
```
--------------------------------------------------------------------------------
/marker/schema/blocks/figure.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class Figure(Block):
    """A chart or other image that contains data."""

    block_type: BlockTypes = BlockTypes.Figure
    description: str | None = None
    html: str | None = None
    block_description: str = "A chart or other image that contains data."

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )
        # Assemble only the Reference children.
        references = [
            child
            for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        html = super().assemble_html(
            document, references, parent_structure, block_config
        )
        if self.description:
            html += f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
        return html
```
--------------------------------------------------------------------------------
/marker/schema/blocks/equation.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class Equation(Block):
    """A block math equation."""

    block_type: BlockTypes = BlockTypes.Equation
    html: str | None = None
    block_description: str = "A block math equation."

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        # No pre-rendered math: wrap the default assembly in a paragraph.
        if not self.html:
            inner = super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
            return f"<p block-type='{self.block_type}'>{inner}</p>"
        # Pre-rendered math: assemble only Reference children, then append
        # the stored HTML in its own paragraph.
        references = [
            child
            for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        assembled = super().assemble_html(
            document, references, parent_structure, block_config
        )
        return assembled + f"""<p block-type='{self.block_type}'>{self.html}</p>"""
```
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/output-bug-report.md:
--------------------------------------------------------------------------------
```markdown
---
name: Output bug report
about: Create a report about poor output quality
title: "[BUG: Output]"
labels: 'bug: output'
assignees: ''
---
## 📝 Describe the Output Issue
A clear and concise description of the incorrect or unexpected output.
## 📄 Input Document
Attach the PDF or input file used.
## 📤 Current Output
Paste the Markdown or HTML that Marker generated:
````markdown
Paste output here
````
## ✅ Expected Output
Describe or paste what you expected Marker to generate.
## ⚙️ Environment
Please fill in all relevant details:
* **Marker version**:
* **Surya version**:
* **Python version**:
* **PyTorch version**:
* **Transformers version**:
* **Operating System**:
## 📟 Command or Code Used
Paste the **exact bash command** or **Python code** you used to run Marker:
<details>
<summary>Click to expand</summary>
```bash
# or Python code block
your_command_here --with-flags
```
</details>
## 📎 Additional Context
Any other relevant info, configs, or assumptions.
```
--------------------------------------------------------------------------------
/tests/converters/test_table_converter.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.converters.table import TableConverter
from marker.renderers.markdown import MarkdownOutput
from marker.util import classes_to_strings
def _table_converter(config, model_dict, renderer, temp_pdf):
    """Run TableConverter on temp_pdf and sanity-check the markdown output."""
    converter = TableConverter(
        artifact_dict=model_dict,
        processor_list=None,
        renderer=classes_to_strings([renderer])[0],
        config=config,
    )
    result: MarkdownOutput = converter(temp_pdf.name)
    text = result.markdown
    assert len(text) > 0
    assert "cyclic" in text
@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5]})
def test_table_converter(config, model_dict, renderer, temp_doc):
    # Digital-text path: convert page 5 and check the table markdown.
    _table_converter(config, model_dict, renderer, temp_doc)


@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5], "force_ocr": True})
def test_table_converter_ocr(config, model_dict, renderer, temp_doc):
    # Same page, but forcing the OCR path instead of extracted text.
    _table_converter(config, model_dict, renderer, temp_doc)
```
--------------------------------------------------------------------------------
/tests/builders/test_blank_page.py:
--------------------------------------------------------------------------------
```python
from surya.layout.schema import LayoutResult
from marker.builders.document import DocumentBuilder
from marker.builders.layout import LayoutBuilder
from marker.builders.line import LineBuilder
def test_blank_page(config, doc_provider, layout_model, ocr_error_model, detection_model):
    """A document with no layout boxes should still yield well-formed pages."""
    layout_builder = LayoutBuilder(layout_model, config)
    line_builder = LineBuilder(detection_model, ocr_error_model)
    document = DocumentBuilder(config).build_document(doc_provider)

    empty_layouts = [
        LayoutResult(bboxes=[], image_bbox=page.polygon.bbox)
        for page in document.pages
    ]
    provider_lines = {page.page_id: [] for page in document.pages}
    ocr_lines = {page.page_id: [] for page in document.pages}

    layout_builder.add_blocks_to_pages(document.pages, empty_layouts)
    line_builder.merge_blocks(document, provider_lines, ocr_lines)

    assert all(isinstance(page.children, list) for page in document.pages)
    assert all(isinstance(page.structure, list) for page in document.pages)
```
--------------------------------------------------------------------------------
/marker/schema/blocks/complexregion.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class ComplexRegion(Block):
    """A mixed region of several block types that resists a single label."""

    block_type: BlockTypes = BlockTypes.ComplexRegion
    html: str | None = None
    block_description: str = "A complex region that can consist of multiple different types of blocks mixed with images. This block is chosen when it is difficult to categorize the region as a single block type."

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        # Without pre-rendered HTML, wrap the default assembly in a paragraph.
        if not self.html:
            inner = super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
            return f"<p>{inner}</p>"
        # Pre-rendered HTML: assemble only Reference children, then append it.
        references = [
            child
            for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        assembled = super().assemble_html(
            document, references, parent_structure, block_config
        )
        return assembled + self.html
```
--------------------------------------------------------------------------------
/marker/schema/blocks/picture.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class Picture(Block):
    """An image block that represents a picture."""

    block_type: BlockTypes = BlockTypes.Picture
    description: str | None = None
    block_description: str = "An image block that represents a picture."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )
        # Assemble only the Reference children.
        references = [
            child
            for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        html = super().assemble_html(
            document, references, parent_structure, block_config
        )
        if not self.description:
            return html
        return (
            html
            + f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
        )
```
--------------------------------------------------------------------------------
/.github/workflows/benchmarks.yml:
--------------------------------------------------------------------------------
```yaml
name: Integration test
on: [push]
env:
PYTHONIOENCODING: "utf-8"
jobs:
benchmark:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [t4_gpu, ubuntu-latest]
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Install apt dependencies
run: |
sudo apt-get update
sudo apt-get install -y pandoc
- name: Install python dependencies
run: |
pip install poetry
poetry install --extras "full"
- name: Run benchmark test
run: |
poetry run python benchmarks/overall/overall.py --max_rows 5
poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/result.json --type marker
- name: Run table benchmark
run: |
poetry run python benchmarks/table/table.py --max_rows 5
poetry run python benchmarks/verify_scores.py conversion_results/benchmark/table/table.json --type table
```
--------------------------------------------------------------------------------
/marker/services/vertex.py:
--------------------------------------------------------------------------------
```python
from typing import Annotated
from google import genai
from marker.services.gemini import BaseGeminiService
class GoogleVertexService(BaseGeminiService):
    """Gemini service routed through Google Vertex AI instead of the public API."""

    # Google Cloud project to run requests under (no default; must be configured).
    vertex_project_id: Annotated[
        str,
        "Google Cloud Project ID for Vertex AI.",
    ] = None
    vertex_location: Annotated[
        str,
        "Google Cloud Location for Vertex AI.",
    ] = "us-central1"
    gemini_model_name: Annotated[
        str,
        "The name of the Google model to use for the service."
    ] = "gemini-2.0-flash-001"
    # When True, requests are tagged for a dedicated Vertex AI endpoint via a header.
    vertex_dedicated: Annotated[
        bool,
        "Whether to use a dedicated Vertex AI instance."
    ] = False

    def get_google_client(self, timeout: int):
        """Build a Vertex-mode genai client; *timeout* is in seconds."""
        http_options = {"timeout": timeout * 1000}  # Convert to milliseconds
        if self.vertex_dedicated:
            http_options["headers"] = {"x-vertex-ai-llm-request-type": "dedicated"}

        return genai.Client(
            vertexai=True,
            project=self.vertex_project_id,
            location=self.vertex_location,
            http_options=http_options,
        )
```
--------------------------------------------------------------------------------
/marker/scripts/chunk_convert.sh:
--------------------------------------------------------------------------------
```bash
#!/bin/bash
# Fan marker out across NUM_DEVICES GPUs, each processing 1/NUM_DEVICES of the
# input folder with NUM_WORKERS workers.
# Usage: NUM_DEVICES=2 NUM_WORKERS=4 chunk_convert.sh <input_folder> <output_folder>

# Kill all child conversions if the user interrupts the script.
trap 'pkill -P $$' SIGINT

# Check if NUM_DEVICES is set
if [[ -z "$NUM_DEVICES" ]]; then
    echo "Please set the NUM_DEVICES environment variable."
    exit 1
fi

if [[ -z "$NUM_WORKERS" ]]; then
    echo "Please set the NUM_WORKERS environment variable."
    exit 1
fi

# Get input folder and output folder from args
if [[ -z "$1" ]]; then
    echo "Please provide an input folder."
    exit 1
fi

if [[ -z "$2" ]]; then
    echo "Please provide an output folder."
    exit 1
fi

INPUT_FOLDER=$1
OUTPUT_FOLDER=$2

# Ensure output folder exists
mkdir -p "$OUTPUT_FOLDER"

# Loop from 0 to NUM_DEVICES and run the marker command in parallel
for (( i=0; i<NUM_DEVICES; i++ )); do
    DEVICE_NUM=$i
    export DEVICE_NUM
    export NUM_DEVICES
    export NUM_WORKERS
    echo "Running marker on GPU $DEVICE_NUM"
    # Run the command directly (no eval) with quoted arguments so paths
    # containing spaces or shell metacharacters are handled safely.
    CUDA_VISIBLE_DEVICES=$DEVICE_NUM marker "$INPUT_FOLDER" \
        --output_dir "$OUTPUT_FOLDER" \
        --num_chunks "$NUM_DEVICES" \
        --chunk_idx "$DEVICE_NUM" \
        --workers "$NUM_WORKERS" &
    # Stagger startups so processes don't all load models at once.
    sleep 5
done

# Wait for all background processes to finish
wait
```
--------------------------------------------------------------------------------
/marker/schema/blocks/__init__.py:
--------------------------------------------------------------------------------
```python
from __future__ import annotations
from marker.schema.blocks.base import Block, BlockId, BlockOutput
from marker.schema.blocks.caption import Caption
from marker.schema.blocks.code import Code
from marker.schema.blocks.figure import Figure
from marker.schema.blocks.footnote import Footnote
from marker.schema.blocks.form import Form
from marker.schema.blocks.equation import Equation
from marker.schema.blocks.handwriting import Handwriting
from marker.schema.blocks.inlinemath import InlineMath
from marker.schema.blocks.listitem import ListItem
from marker.schema.blocks.pagefooter import PageFooter
from marker.schema.blocks.pageheader import PageHeader
from marker.schema.blocks.picture import Picture
from marker.schema.blocks.sectionheader import SectionHeader
from marker.schema.blocks.table import Table
from marker.schema.blocks.text import Text
from marker.schema.blocks.toc import TableOfContents
from marker.schema.blocks.complexregion import ComplexRegion
from marker.schema.blocks.tablecell import TableCell
from marker.schema.blocks.reference import Reference
```
--------------------------------------------------------------------------------
/marker/schema/blocks/tablecell.py:
--------------------------------------------------------------------------------
```python
from typing import List
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class TableCell(Block):
    """A single table cell, rendered as a <td> (or <th> for headers)."""

    block_type: BlockTypes = BlockTypes.TableCell
    rowspan: int
    colspan: int
    row_id: int
    col_id: int
    is_header: bool
    text_lines: List[str] | None = None
    block_description: str = "A cell in a table."

    @property
    def text(self):
        """Cell text with lines joined by newlines; empty string when no lines.

        Bug fix: previously raised TypeError when text_lines was None (the
        declared default), while assemble_html already guarded against None.
        """
        return "\n".join(self.text_lines or [])

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        """Render the cell as HTML.

        Emits rowspan/colspan attributes only when greater than 1, optionally
        tags the element with its block id (block_config["add_block_ids"]),
        and joins text lines with <br>.
        """
        add_cell_id = block_config and block_config.get("add_block_ids", False)
        tag_cls = "th" if self.is_header else "td"
        tag = f"<{tag_cls}"
        if self.rowspan > 1:
            tag += f" rowspan={self.rowspan}"
        if self.colspan > 1:
            tag += f" colspan={self.colspan}"
        if add_cell_id:
            tag += f' data-block-id="{self.id}"'
        if self.text_lines is None:
            self.text_lines = []
        text = "<br>".join(self.text_lines)
        return f"{tag}>{text}</{tag_cls}>"
```
--------------------------------------------------------------------------------
/benchmarks/verify_scores.py:
--------------------------------------------------------------------------------
```python
import json
import argparse
def verify_scores(file_path):
    """Raise ValueError unless the mean marker heuristic score is at least 90."""
    with open(file_path, "r") as fh:
        results = json.load(fh)

    heuristic_scores = [
        entry["marker"]["heuristic"]["score"]
        for entry in results["scores"].values()
    ]
    mean_score = sum(heuristic_scores) / len(heuristic_scores)

    if mean_score < 90:
        raise ValueError("Marker score below 90")
def verify_table_scores(file_path):
    """Raise ValueError unless the mean marker table score is at least 0.7.

    Raises:
        ValueError: when the average "marker_score" across rows is below 0.7.
    """
    with open(file_path, "r") as file:
        data = json.load(file)

    marker_rows = data["marker"]
    # Bug fix: average over the number of marker rows, not len(data), which
    # counted top-level JSON keys and deflated the average whenever the file
    # contained additional keys besides "marker".
    avg = sum(row["marker_score"] for row in marker_rows) / len(marker_rows)
    if avg < 0.7:
        raise ValueError("Average score is below the required threshold of 0.7")
if __name__ == "__main__":
    # CLI: verify a benchmark result file of the given type.
    cli = argparse.ArgumentParser(description="Verify benchmark scores")
    cli.add_argument("file_path", type=str, help="Path to the json file")
    cli.add_argument("--type", type=str, help="Type of file to verify", default="marker")
    parsed = cli.parse_args()

    # Dispatch table instead of an if/elif chain; unknown types are a no-op,
    # matching the original behavior.
    verifiers = {"marker": verify_scores, "table": verify_table_scores}
    verifier = verifiers.get(parsed.type)
    if verifier is not None:
        verifier(parsed.file_path)
```
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/breaking-bug-report.md:
--------------------------------------------------------------------------------
```markdown
---
name: Breaking bug report
about: Create a report about a breaking bug
title: "[BUG: Breaking]"
labels: 'bug: breaking'
assignees: ''
---
## 🧨 Describe the Bug
A clear and concise description of the breaking issue (e.g., crash, OOM, exception, etc).
## 📄 Input Document
Attach the PDF or input file that triggered the error.
## 📤 Output Trace / Stack Trace
Paste the **complete** stack trace or error output, if available.
<details>
<summary>Click to expand</summary>
```
Paste stack trace here
```
</details>
## ⚙️ Environment
Please fill in all relevant details:
- **Marker version**:
- **Surya version**:
- **Python version**:
- **PyTorch version**:
- **Transformers version**:
- **Operating System** (incl. container info if relevant):
## ✅ Expected Behavior
What did you expect Marker to do?
## 📟 Command or Code Used
Paste the **exact bash command** or **Python code** you used to run Marker:
<details>
<summary>Click to expand</summary>
```bash
# or Python code block
your_command_here --with-flags
```
</details>
## 📎 Additional Context
Any other context that might help us debug this (e.g., CLI options, working directory, runtime settings).
```
--------------------------------------------------------------------------------
/marker/models.py:
--------------------------------------------------------------------------------
```python
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = (
"1" # Transformers uses .isin for an op, which is not supported on MPS
)
from surya.foundation import FoundationPredictor
from surya.detection import DetectionPredictor
from surya.layout import LayoutPredictor
from surya.ocr_error import OCRErrorPredictor
from surya.recognition import RecognitionPredictor
from surya.table_rec import TableRecPredictor
from surya.settings import settings as surya_settings
def create_model_dict(
    device=None, dtype=None, attention_implementation: str | None = None
) -> dict:
    """Instantiate every surya predictor marker needs, keyed by its role."""

    # Layout and recognition wrap a foundation model built the same way;
    # factor out the shared constructor call.
    def _foundation(checkpoint):
        return FoundationPredictor(
            checkpoint=checkpoint,
            attention_implementation=attention_implementation,
            device=device,
            dtype=dtype,
        )

    return {
        "layout_model": LayoutPredictor(
            _foundation(surya_settings.LAYOUT_MODEL_CHECKPOINT)
        ),
        "recognition_model": RecognitionPredictor(
            _foundation(surya_settings.RECOGNITION_MODEL_CHECKPOINT)
        ),
        "table_rec_model": TableRecPredictor(device=device, dtype=dtype),
        "detection_model": DetectionPredictor(device=device, dtype=dtype),
        "ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype),
    }
```
--------------------------------------------------------------------------------
/marker/extractors/__init__.py:
--------------------------------------------------------------------------------
```python
from typing import Annotated, Sequence
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.groups import PageGroup
from PIL import Image
from marker.services import BaseService
from marker.util import assign_config
class BaseExtractor:
    """
    An extractor that uses a provided service to extract structured data from documents.
    """

    # Not read in this base class; presumably consumed by subclasses — confirm.
    max_concurrency: Annotated[
        int,
        "The maximum number of concurrent requests to make to the Gemini model.",
    ] = 3
    # Suppress progress bars (useful for headless/batch runs).
    disable_tqdm: Annotated[
        bool,
        "Whether to disable the tqdm progress bar.",
    ] = False

    def __init__(self, llm_service: BaseService, config=None):
        # Copy matching config keys onto the instance, then keep the service.
        assign_config(self, config)
        self.llm_service = llm_service

    def extract_image(
        self,
        document: Document,
        page: PageGroup,
        remove_blocks: Sequence[BlockTypes] | None = None,
        highres: bool = False,  # Default False to save tokens
    ) -> Image.Image:
        """Render *page* as an image, optionally excluding some block types."""
        return page.get_image(
            document,
            highres=highres,
            remove_blocks=remove_blocks,
        )

    def __call__(self, document: Document, *args, **kwargs):
        # Subclasses implement the actual extraction logic.
        raise NotImplementedError
```
--------------------------------------------------------------------------------
/marker/scripts/file_to_s3.py:
--------------------------------------------------------------------------------
```python
import json
import shutil
import datetime
from pathlib import Path
import boto3
from huggingface_hub import snapshot_download
import click
# Cloudflare R2 endpoint (S3-compatible API).
S3_API_URL = "https://1afbe4656a6b40d982ab5e730a39f6b9.r2.cloudflarestorage.com"


@click.command(help="Uploads files to an S3 bucket")
@click.argument("filepath", type=str)
@click.argument("s3_path", type=str)
@click.option("--bucket_name", type=str, default="datalab")
@click.option("--access_key_id", type=str, default="<access_key_id>")
@click.option("--access_key_secret", type=str, default="<access_key_secret>")
def main(filepath: str, s3_path: str, bucket_name: str, access_key_id: str, access_key_secret: str):
    """Upload one file to <bucket>/<s3_path>/<filename> on the R2 endpoint.

    The key defaults are placeholders; real credentials must be passed in.
    """
    filepath = Path(filepath)

    # Upload the files to S3
    s3_client = boto3.client(
        's3',
        endpoint_url=S3_API_URL,
        aws_access_key_id=access_key_id,
        aws_secret_access_key=access_key_secret,
        region_name="enam"
    )
    s3_key = f"{s3_path}/{filepath.name}"
    try:
        s3_client.upload_file(
            str(filepath),
            bucket_name,
            s3_key
        )
    except Exception as e:
        # Best-effort: failures are reported but do not abort or set a
        # non-zero exit status; the summary print below still runs.
        print(f"Error uploading {filepath}: {str(e)}")

    print(f"Uploaded files to {s3_path}")


if __name__ == "__main__":
    main()
```
--------------------------------------------------------------------------------
/benchmarks/overall/methods/marker.py:
--------------------------------------------------------------------------------
```python
import os
import tempfile
import time
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
class MarkerMethod(BaseMethod):
    """Benchmark method that converts a one-page PDF using marker itself."""

    # Injected by the benchmark harness before the instance is called.
    model_dict: dict = None
    # When True, also enables LLM post-processing via Vertex AI Gemini.
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        pdf_bytes = sample["pdf"]  # This is a single page PDF
        parser = ConfigParser({
            "page_range": "0",
            "disable_tqdm": True,
            "use_llm": self.use_llm,
            "redo_inline_math": self.use_llm,
            "llm_service": "marker.services.vertex.GoogleVertexService",
            "vertex_project_id": os.getenv("VERTEX_PROJECT_ID"),
        })
        block_converter = PdfConverter(
            artifact_dict=self.model_dict,
            config=parser.generate_config_dict(),
            llm_service=parser.get_llm_service()
        )
        # Write the bytes to a real file path for the converter; the file is
        # removed when the with-block exits.
        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            # Only the conversion call is timed, not converter construction.
            start = time.time()
            rendered = block_converter(f.name)
            total = time.time() - start

        return {
            "markdown": rendered.markdown,
            "time": total
        }
```
--------------------------------------------------------------------------------
/marker/utils/image.py:
--------------------------------------------------------------------------------
```python
from PIL import Image
import numpy as np
import cv2
from typing import List, Optional
def is_blank_image(image: Image.Image, polygon: Optional[List[List[int]]] = None) -> bool:
    """Return True when the image contains no detectable ink/marks."""
    arr = np.asarray(image)

    # Empty or zero-sized images are trivially blank.
    if arr is None or arr.size == 0 or arr.shape[0] == 0 or arr.shape[1] == 0:
        return True

    # A polygon that collapses after rounding (matching corner pairs) is
    # treated as blank as well.
    if polygon is not None:
        corners = [[int(pt[0]), int(pt[1])] for pt in polygon]
        if corners[0] == corners[1] and corners[2] == corners[3]:
            return True

    gray = cv2.cvtColor(arr, cv2.COLOR_RGB2GRAY)
    gray = cv2.GaussianBlur(gray, (7, 7), 0)

    # Adaptive threshold, inverted so ink becomes white foreground.
    binarized = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15
    )

    num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
        binarized, connectivity=8
    )
    # Keep every non-background component (label 0 is the background); this is
    # the vectorized equivalent of painting each label id to 255 in a loop.
    cleaned = np.where(labels > 0, 255, 0).astype(binarized.dtype)

    # Dilate horizontally so nearby marks merge before the emptiness check.
    kernel = np.ones((1, 5), np.uint8)
    dilated = cv2.dilate(cleaned, kernel, iterations=3)

    return bool(np.count_nonzero(dilated) == 0)
```
--------------------------------------------------------------------------------
/marker/schema/blocks/text.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class Text(Block):
    """A paragraph or line of text, rendered as a <p> element."""

    block_type: BlockTypes = BlockTypes.Text
    has_continuation: bool = False
    blockquote: bool = False
    blockquote_level: int = 0
    html: str | None = None
    block_description: str = "A paragraph or line of text."

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.ignore_for_output:
            return ""

        # Pre-rendered HTML (set by an LLM processor) takes precedence.
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        body = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        ).replace("\n", " ")

        attrs = f" block-type='{self.block_type}'"
        if self.has_continuation:
            attrs += " class='has-continuation'"

        paragraph = f"<p{attrs}>{body}</p>"
        if not self.blockquote:
            return paragraph
        # Nest the paragraph in one <blockquote> pair per quoting level.
        opening = "<blockquote>" * self.blockquote_level
        closing = "</blockquote>" * self.blockquote_level
        return f"{opening}{paragraph}{closing}"
```
--------------------------------------------------------------------------------
/.github/workflows/cla.yml:
--------------------------------------------------------------------------------
```yaml
# Runs the CLA assistant bot on PR events and on signing/"recheck" comments.
name: "Marker CLA Assistant"
on:
  issue_comment:
    types: [created]
  pull_request_target:
    types: [opened,closed,synchronize]

# explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings
permissions:
  actions: write
  contents: write
  pull-requests: write
  statuses: write

jobs:
  CLAAssistant:
    runs-on: ubuntu-latest
    steps:
      - name: "Marker CLA Assistant"
        # Fire on the exact signing phrase, a "recheck" comment, or any PR event.
        if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target'
        uses: contributor-assistant/[email protected]
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # the below token should have repo scope and must be manually added by you in the repository's secret
          # This token is required only if you have configured to store the signatures in a remote repository/organization
          PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
        with:
          path-to-signatures: 'signatures/version1/cla.json'
          path-to-document: 'https://github.com/VikParuchuri/marker/blob/master/CLA.md'
          # branch should not be protected
          branch: 'master'
          allowlist: VikParuchuri,Sandy
```
--------------------------------------------------------------------------------
/tests/builders/test_overriding.py:
--------------------------------------------------------------------------------
```python
import multiprocessing as mp
import pytest
from marker.providers.pdf import PdfProvider
from marker.schema import BlockTypes
from marker.schema.blocks import SectionHeader
from marker.schema.document import Document
from marker.schema.registry import register_block_class
from marker.schema.text import Line
from tests.utils import setup_pdf_provider
# Trivial subclasses used to verify that override_map swaps in custom classes.
class NewSectionHeader(SectionHeader):
    pass


class NewLine(Line):
    pass
@pytest.mark.config({
    "page_range": [0],
    "override_map": {BlockTypes.SectionHeader: NewSectionHeader}
})
def test_overriding(pdf_document: Document):
    # The first block on page 0 should have been built with the override class.
    assert pdf_document.pages[0]\
        .get_block(pdf_document.pages[0].structure[0]).__class__ == NewSectionHeader
def get_lines(pdf: str, config=None):
    """Pool worker: register the override classes, then parse page 0's lines."""
    # Re-register the overrides here since this runs in a separate process
    # (registry state from the parent does not carry over).
    for block_type, block_cls in config["override_map"].items():
        register_block_class(block_type, block_cls)

    provider: PdfProvider = setup_pdf_provider(pdf, config)
    return provider.get_page_lines(0)
def test_overriding_mp():
    """Overrides must also apply when providers run in worker processes."""
    config = {
        "page_range": [0],
        "override_map": {BlockTypes.Line: NewLine}
    }
    pdf_list = ["adversarial.pdf", "adversarial_rot.pdf"]
    with mp.Pool(processes=2) as pool:
        results = pool.starmap(get_lines, [(pdf, config) for pdf in pdf_list])
        # Every worker's first line should be an instance of the override class.
        assert all([r[0].line.__class__ == NewLine for r in results])
```
--------------------------------------------------------------------------------
/.github/workflows/scripts.yml:
--------------------------------------------------------------------------------
```yaml
# CI smoke tests for the marker CLI entry points on a GPU runner.
name: Test CLI scripts
on: [push]
jobs:
  tests:
    runs-on: t4_gpu
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install --extras "full"
      - name: Download benchmark data
        run: |
          wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
          unzip -o benchmark_data.zip
      - name: Test single script
        run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0
      - name: Test convert script
        run: poetry run marker benchmark_data/pdfs --max_files 1 --page_range 0
      # Step name typo fixed: "Text" -> "Test".
      - name: Test convert script multiple workers
        run: poetry run marker benchmark_data/pdfs --max_files 2 --page_range 0-5
      # The LLM run is best-effort; the job only fails if a UserWarning leaks
      # into the output.
      - name: Test llm option
        run: |
          poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 --use_llm > output.txt || echo "Command failed but continuing"
          if ! grep -q "UserWarning" output.txt; then
            echo "Success: No UserWarning found"
            exit 0
          else
            echo "Error: UserWarning found in output"
            exit 1
          fi
```
--------------------------------------------------------------------------------
/marker/processors/footnote.py:
--------------------------------------------------------------------------------
```python
import re
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.groups import PageGroup
class FootnoteProcessor(BaseProcessor):
    """
    A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.
    """

    block_types = (BlockTypes.Footnote,)

    def __call__(self, document: Document):
        for page in document.pages:
            self.push_footnotes_to_bottom(page, document)
            self.assign_superscripts(page, document)

    def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
        """Move each top-level footnote block to the end of the page structure."""
        footnote_blocks = page.contained_blocks(document, self.block_types)

        # Push footnotes to the bottom
        for block in footnote_blocks:
            # Check if it is top-level
            if block.id in page.structure:
                # Move to bottom if it is
                page.structure.remove(block.id)
                page.add_structure(block)

    def assign_superscripts(self, page: PageGroup, document: Document):
        """Flag the first marker-like span of each footnote as superscript.

        Only the first span starting with digits or punctuation is flagged —
        presumably the footnote number/symbol.
        """
        footnote_blocks = page.contained_blocks(document, self.block_types)

        for block in footnote_blocks:
            for span in block.contained_blocks(document, (BlockTypes.Span,)):
                # Leading digits or non-word chars look like a footnote marker.
                if re.match(r"^[0-9\W]+", span.text):
                    span.has_superscript = True
                    break
```
--------------------------------------------------------------------------------
/marker/scripts/convert_single.py:
--------------------------------------------------------------------------------
```python
import os
os.environ["GRPC_VERBOSITY"] = "ERROR"
os.environ["GLOG_minloglevel"] = "2"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = (
"1" # Transformers uses .isin for a simple op, which is not supported on MPS
)
import time
import click
from marker.config.parser import ConfigParser
from marker.config.printer import CustomClickPrinter
from marker.logger import configure_logging, get_logger
from marker.models import create_model_dict
from marker.output import save_output
configure_logging()
logger = get_logger()
@click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.")
@click.argument("fpath", type=str)
@ConfigParser.common_options
def convert_single_cli(fpath: str, **kwargs):
    """CLI entry point: convert the document at *fpath* and save the output."""
    # Note: model loading happens before the timer starts, so the reported
    # total time covers conversion only.
    models = create_model_dict()
    start = time.time()
    config_parser = ConfigParser(kwargs)

    converter_cls = config_parser.get_converter_cls()
    converter = converter_cls(
        config=config_parser.generate_config_dict(),
        artifact_dict=models,
        processor_list=config_parser.get_processors(),
        renderer=config_parser.get_renderer(),
        llm_service=config_parser.get_llm_service(),
    )
    rendered = converter(fpath)
    out_folder = config_parser.get_output_folder(fpath)
    save_output(rendered, out_folder, config_parser.get_base_filename(fpath))

    logger.info(f"Saved markdown to {out_folder}")
    logger.info(f"Total time: {time.time() - start}")
```
--------------------------------------------------------------------------------
/marker/schema/blocks/inlinemath.py:
--------------------------------------------------------------------------------
```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block
class InlineMath(Block):
    """A text block containing inline math, rendered as a <p> element."""

    block_type: BlockTypes = BlockTypes.TextInlineMath
    has_continuation: bool = False
    blockquote: bool = False
    blockquote_level: int = 0
    block_description: str = "A text block that contains inline math. This is not used for italic text or references - only for text that contains math."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.ignore_for_output:
            return ""

        # Pre-rendered HTML (from an LLM processor) short-circuits assembly.
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        body = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        ).replace("\n", " ")

        attrs = f" block-type='{self.block_type}'"
        if self.has_continuation:
            attrs += " class='has-continuation'"

        paragraph = f"<p{attrs}>{body}</p>"
        if not self.blockquote:
            return paragraph
        # Wrap in one <blockquote> pair per nesting level.
        opening = "<blockquote>" * self.blockquote_level
        closing = "</blockquote>" * self.blockquote_level
        return f"{opening}{paragraph}{closing}"
--------------------------------------------------------------------------------
/tests/builders/test_layout_replace.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.builders.document import DocumentBuilder
from marker.builders.layout import LayoutBuilder
from marker.builders.line import LineBuilder
from marker.renderers.markdown import MarkdownRenderer
from marker.schema import BlockTypes
from marker.schema.registry import get_block_class
@pytest.mark.filename("thinkpython.pdf")
@pytest.mark.config({"page_range": [0]})
def test_layout_replace(
    request, config, doc_provider, layout_model, ocr_error_model, detection_model
):
    # The llm layout builder replaces blocks - this makes sure text is still merged properly
    layout_builder = LayoutBuilder(layout_model, config)
    line_builder = LineBuilder(detection_model, ocr_error_model, config)
    builder = DocumentBuilder(config)
    document = builder.build_document(doc_provider)
    layout_builder(document, doc_provider)
    page = document.pages[0]
    new_blocks = []

    # Swap every Text block for a TextInlineMath block, preserving geometry
    # and structure, before the line builder runs.
    for block in page.contained_blocks(document, (BlockTypes.Text,)):
        generated_block_class = get_block_class(BlockTypes.TextInlineMath)
        generated_block = generated_block_class(
            polygon=block.polygon,
            page_id=block.page_id,
            structure=block.structure,
        )
        page.replace_block(block, generated_block)
        new_blocks.append(generated_block)
    line_builder(document, doc_provider)

    # Each replacement block should still carry text after line building.
    for block in new_blocks:
        assert block.raw_text(document).strip()

    renderer = MarkdownRenderer(config)
    rendered = renderer(document)
    assert "Think Python" in rendered.markdown
```
--------------------------------------------------------------------------------
/marker/processors/code.py:
--------------------------------------------------------------------------------
```python
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Code
from marker.schema.document import Document
class CodeProcessor(BaseProcessor):
    """
    A processor for formatting code blocks.
    """

    block_types = (BlockTypes.Code, )

    def __call__(self, document: Document):
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                self.format_block(document, block)

    def format_block(self, document: Document, block: Code):
        """Reconstruct leading indentation for a code block from line geometry.

        Estimates an average character width from all lines, then prefixes each
        line that begins a new physical row with enough spaces to match its
        horizontal offset from the left-most line.
        """
        min_left = 9999  # will contain x- coord of column 0
        total_width = 0
        total_chars = 0

        contained_lines = block.contained_blocks(document, (BlockTypes.Line,))
        # First pass: find the left margin and accumulate width/char counts.
        for line in contained_lines:
            min_left = min(line.polygon.bbox[0], min_left)
            total_width += line.polygon.width
            total_chars += len(line.raw_text(document))

        # max(..., 1) avoids division by zero for an empty block.
        avg_char_width = total_width / max(total_chars, 1)
        code_text = ""
        is_new_line = False
        # Second pass: rebuild the text, indenting lines by their x-offset.
        for line in contained_lines:
            text = line.raw_text(document)
            if avg_char_width == 0:
                prefix = ""
            else:
                total_spaces = int((line.polygon.bbox[0] - min_left) / avg_char_width)
                prefix = " " * max(0, total_spaces)

            # Only lines following a newline get the prefix; the very first
            # line is emitted as-is.
            if is_new_line:
                text = prefix + text

            code_text += text
            is_new_line = text.endswith("\n")

        block.code = code_text.rstrip()
```
--------------------------------------------------------------------------------
/marker/settings.py:
--------------------------------------------------------------------------------
```python
from typing import Optional
from dotenv import find_dotenv
from pydantic import computed_field
from pydantic_settings import BaseSettings
import torch
import os
class Settings(BaseSettings):
    """Global marker settings; values can be overridden via env vars / local.env."""

    # Paths
    BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results")
    FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
    DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")
    ARTIFACT_URL: str = "https://models.datalab.to/artifacts"
    FONT_NAME: str = "GoNotoCurrent-Regular.ttf"
    FONT_PATH: str = os.path.join(FONT_DIR, FONT_NAME)
    LOGLEVEL: str = "INFO"

    # General
    OUTPUT_ENCODING: str = "utf-8"
    OUTPUT_IMAGE_FORMAT: str = "JPEG"

    # LLM
    GOOGLE_API_KEY: Optional[str] = ""

    # General models
    TORCH_DEVICE: Optional[str] = (
        None  # Note: MPS device does not work for text detection, and will default to CPU
    )

    @computed_field
    @property
    def TORCH_DEVICE_MODEL(self) -> str:
        # Explicit override wins; otherwise prefer CUDA, then MPS, then CPU.
        if self.TORCH_DEVICE is not None:
            return self.TORCH_DEVICE

        if torch.cuda.is_available():
            return "cuda"

        if torch.backends.mps.is_available():
            return "mps"

        return "cpu"

    @computed_field
    @property
    def MODEL_DTYPE(self) -> torch.dtype:
        # bfloat16 only on CUDA; every other device runs in full precision.
        if self.TORCH_DEVICE_MODEL == "cuda":
            return torch.bfloat16
        else:
            return torch.float32

    class Config:
        # Load overrides from a local.env file if present; ignore unknown keys.
        env_file = find_dotenv("local.env")
        extra = "ignore"


# Singleton settings instance used throughout marker.
settings = Settings()
```
--------------------------------------------------------------------------------
/marker/converters/ocr.py:
--------------------------------------------------------------------------------
```python
from typing import Tuple
from marker.builders.document import DocumentBuilder
from marker.builders.line import LineBuilder
from marker.builders.ocr import OcrBuilder
from marker.converters.pdf import PdfConverter
from marker.processors import BaseProcessor
from marker.processors.equation import EquationProcessor
from marker.providers.registry import provider_from_filepath
from marker.renderers.ocr_json import OCRJSONRenderer
class OCRConverter(PdfConverter):
    """PdfConverter variant that forces OCR on every page and renders OCR JSON.

    Only the equation processor runs by default; layout, line, and OCR
    builders still apply during document construction.
    """

    default_processors: Tuple[BaseProcessor, ...] = (EquationProcessor,)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Always OCR, regardless of any embedded text in the source file.
        if not self.config:
            self.config = {}
        self.config["force_ocr"] = True
        self.renderer = OCRJSONRenderer

    def build_document(self, filepath: str):
        """Build and process a Document for *filepath* with the OCR pipeline."""
        provider_cls = provider_from_filepath(filepath)
        layout_builder = self.resolve_dependencies(self.layout_builder_class)
        line_builder = self.resolve_dependencies(LineBuilder)
        ocr_builder = self.resolve_dependencies(OcrBuilder)
        document_builder = DocumentBuilder(self.config)

        provider = provider_cls(filepath, self.config)
        document = document_builder(provider, layout_builder, line_builder, ocr_builder)

        for processor in self.processor_list:
            processor(document)

        return document

    def __call__(self, filepath: str):
        document = self.build_document(filepath)
        self.page_count = len(document.pages)
        renderer = self.resolve_dependencies(self.renderer)
        return renderer(document)
```
--------------------------------------------------------------------------------
/tests/builders/test_document_builder.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.schema import BlockTypes
from marker.schema.text.line import Line
@pytest.mark.filename("thinkpython.pdf")
@pytest.mark.config({"page_range": [0]})
def test_document_builder(pdf_document):
    # Walk the block hierarchy: page -> section header -> line -> span, and
    # verify types, extraction method, and span text/font/formats.
    first_page = pdf_document.pages[0]
    assert first_page.structure[0] == "/page/0/SectionHeader/0"

    first_block = first_page.get_block(first_page.structure[0])
    assert first_block.block_type == BlockTypes.SectionHeader
    # Native text extraction (pdftext) is expected for this digital PDF.
    assert first_block.text_extraction_method == "pdftext"

    first_text_block: Line = first_page.get_block(first_block.structure[0])
    assert first_text_block.block_type == BlockTypes.Line

    first_span = first_page.get_block(first_text_block.structure[0])
    assert first_span.block_type == BlockTypes.Span
    assert first_span.text == "Think Python"
    assert first_span.font == "URWPalladioL-Roma"
    assert first_span.formats == ["plain"]
@pytest.mark.config({"page_range": [0]})
def test_document_builder_inline_eq(pdf_document):
    # Same hierarchy walk as above, but on the default fixture document where
    # the header is OCR-extracted (surya) and the span carries bold formatting.
    first_page = pdf_document.pages[0]
    assert first_page.structure[0] == "/page/0/SectionHeader/0"

    first_block = first_page.get_block(first_page.structure[0])
    assert first_block.block_type == BlockTypes.SectionHeader
    assert first_block.text_extraction_method == "surya"

    first_text_block: Line = first_page.get_block(first_block.structure[0])
    assert first_text_block.block_type == BlockTypes.Line

    first_span = first_page.get_block(first_text_block.structure[0])
    assert first_span.block_type == BlockTypes.Span
    assert first_span.text.strip() == "Subspace Adversarial Training"
    assert "bold" in first_span.formats
```
--------------------------------------------------------------------------------
/marker/services/__init__.py:
--------------------------------------------------------------------------------
```python
from typing import Optional, List, Annotated
from io import BytesIO
import PIL
from pydantic import BaseModel
from marker.schema.blocks import Block
from marker.util import assign_config, verify_config_keys
import base64
class BaseService:
    """Base class for LLM service backends: config handling plus image helpers."""

    # Per-request timeout, in seconds.
    timeout: Annotated[int, "The timeout to use for the service."] = 30
    max_retries: Annotated[
        int, "The maximum number of retries to use for the service."
    ] = 2
    # Seconds to wait between retries.
    retry_wait_time: Annotated[int, "The wait time between retries."] = 3
    # None presumably means "use the provider's default limit" — confirm.
    max_output_tokens: Annotated[
        int, "The maximum number of output tokens to generate."
    ] = None

    def img_to_base64(self, img: PIL.Image.Image, format: str = "WEBP"):
        """Encode *img* as base64 (WEBP by default) for inline transport."""
        image_bytes = BytesIO()
        img.save(image_bytes, format=format)
        return base64.b64encode(image_bytes.getvalue()).decode("utf-8")

    def process_images(self, images: List[PIL.Image.Image]) -> list:
        # Subclasses convert PIL images into provider-specific message parts.
        raise NotImplementedError

    def format_image_for_llm(self, image):
        """Normalize an image, list of images, or falsy value into message parts."""
        if not image:
            return []

        if not isinstance(image, list):
            image = [image]

        image_parts = self.process_images(image)
        return image_parts

    def __init__(self, config: Optional[BaseModel | dict] = None):
        assign_config(self, config)

        # Ensure we have all necessary fields filled out (API keys, etc.)
        verify_config_keys(self)

    def __call__(
        self,
        prompt: str,
        image: PIL.Image.Image | List[PIL.Image.Image] | None,
        block: Block | None,
        response_schema: type[BaseModel],
        max_retries: int | None = None,
        timeout: int | None = None,
    ):
        # Subclasses perform the actual model call and return parsed output.
        raise NotImplementedError
```
--------------------------------------------------------------------------------
/tests/providers/test_document_providers.py:
--------------------------------------------------------------------------------
```python
import pytest
@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("lambda.pptx")
def test_pptx_provider(doc_provider):
    """PPTX slides render landscape at 72 DPI and expose their text lines."""
    image = doc_provider.get_images([0], 72)[0]
    assert image.size == (842, 596)

    lines = doc_provider.get_page_lines(0)
    assert lines[0].spans[0].text == "Lambda Calculus"
    assert lines[1].spans[0].text == "CSE 340 – Principles of Programming Languages"
@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("manual.epub")
def test_epub_provider(doc_provider):
    """EPUBs render to portrait pages and expose their text lines."""
    image = doc_provider.get_images([0], 72)[0]
    assert image.size == (596, 842)
    first_line = doc_provider.get_page_lines(0)[0]
    assert first_line.spans[0].text == "The Project Gutenberg eBook of Simple"
@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("china.html")
def test_html_provider(doc_provider):
    """HTML documents render to portrait pages and expose their text lines."""
    image = doc_provider.get_images([0], 72)[0]
    assert image.size == (596, 842)
    first_line = doc_provider.get_page_lines(0)[0]
    assert first_line.spans[0].text == "Jump to content"
@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("gatsby.docx")
def test_docx_provider(doc_provider):
    """DOCX documents render to portrait pages and expose their text lines."""
    image = doc_provider.get_images([0], 72)[0]
    assert image.size == (596, 842)
    first_line = doc_provider.get_page_lines(0)[0]
    assert first_line.spans[0].text == "Themes"
@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("single_sheet.xlsx")
def test_xlsx_provider(doc_provider):
    """XLSX sheets render landscape; the sheet name appears as the first line."""
    image = doc_provider.get_images([0], 72)[0]
    assert image.size == (842, 596)
    first_line = doc_provider.get_page_lines(0)[0]
    assert first_line.spans[0].text == "Sheet1"
```
--------------------------------------------------------------------------------
/marker/schema/blocks/listitem.py:
--------------------------------------------------------------------------------
```python
import re
from marker.schema import BlockTypes
from marker.schema.blocks import Block
def replace_bullets(child_blocks):
    """Strip a leading bullet glyph from the deepest first-descendant line.

    The <li> markup already supplies the bullet, so the glyph in the text
    itself is removed (the surrounding whitespace/tag capture is kept).
    """
    # Walk the first-child chain down to the leaf block.
    leaf = None
    blocks = child_blocks
    while blocks:
        leaf = blocks[0]
        blocks = leaf.children

    if leaf is None or leaf.id.block_type != BlockTypes.Line:
        return

    pattern = r"(^|[\n ]|<[^>]*>)[•●○ഠ ം◦■▪▫–—-]( )"
    leaf.html = re.sub(pattern, r"\1\2", leaf.html)
class ListItem(Block):
    block_type: BlockTypes = BlockTypes.ListItem
    # Nesting depth; 0 means a top-level item.
    list_indent_level: int = 0
    block_description: str = "A list item that is part of a list. This block is used to represent a single item in a list."
    # Pre-rendered HTML override; presumably set by an LLM processor — when
    # present it replaces the template assembled from children.
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Render this item as an <li> (wrapped in a <ul> when indented).

        The leading bullet glyph is stripped from the first descendant line,
        since the <li> markup already provides the bullet.
        """
        template = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
        template = template.replace("\n", " ")
        # Remove the first bullet character
        replace_bullets(child_blocks)
        if self.html:
            # Override path: re-derive output from the stored HTML, and strip
            # any <li> wrapper so we can re-wrap it consistently below.
            template = (
                super()
                .handle_html_output(
                    document, child_blocks, parent_structure, block_config
                )
                .strip()
            )
            template = template.replace("<li>", "").replace("</li>", "")
        el_attr = f" block-type='{self.block_type}'"
        if self.list_indent_level:
            return f"<ul><li{el_attr} class='list-indent-{self.list_indent_level}'>{template}</li></ul>"
        return f"<li{el_attr}>{template}</li>"
```
--------------------------------------------------------------------------------
/marker/providers/image.py:
--------------------------------------------------------------------------------
```python
from typing import List, Annotated
from PIL import Image
from marker.providers import ProviderPageLines, BaseProvider
from marker.schema.polygon import PolygonBox
from marker.schema.text import Line
from pdftext.schema import Reference
class ImageProvider(BaseProvider):
    """Provider that serves a single image file as a one-page document."""

    page_range: Annotated[
        List[int],
        "The range of pages to process.",
        "Default is None, which will process all pages.",
    ] = None
    image_count: int = 1

    def __init__(self, filepath: str, config=None):
        super().__init__(filepath, config)

        self.images = [Image.open(filepath)]
        # A raw image has no text layer: every page starts with zero lines.
        self.page_lines: ProviderPageLines = {i: [] for i in range(self.image_count)}

        if self.page_range is None:
            self.page_range = range(self.image_count)

        # Bug fix: the old message interpolated len(self.doc), but ImageProvider
        # has no `doc` attribute, so a bad page range raised AttributeError
        # instead of the intended assertion message.
        assert max(self.page_range) < self.image_count and min(self.page_range) >= 0, (
            f"Invalid page range, values must be between 0 and {self.image_count - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."
        )

        self.page_bboxes = {
            i: [0, 0, self.images[i].size[0], self.images[i].size[1]]
            for i in self.page_range
        }

    def __len__(self):
        return self.image_count

    def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
        # dpi is ignored: the image is returned at its native resolution.
        return [self.images[i] for i in idxs]

    def get_page_bbox(self, idx: int) -> PolygonBox | None:
        bbox = self.page_bboxes[idx]
        if bbox:
            return PolygonBox.from_bbox(bbox)

    def get_page_lines(self, idx: int) -> List[Line]:
        return self.page_lines[idx]

    def get_page_refs(self, idx: int) -> List[Reference]:
        # Plain images cannot contain references.
        return []
```
--------------------------------------------------------------------------------
/tests/builders/test_ocr_pipeline.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.schema import BlockTypes
from marker.schema.text.line import Line
def _ocr_pipeline_test(pdf_document):
    """Shared OCR-pipeline assertions: header text and bbox scale agreement."""
    page = pdf_document.pages[0]
    assert page.structure[0] == "/page/0/SectionHeader/0"

    header = page.get_block(page.structure[0])
    assert header.text_extraction_method == "surya"
    assert header.block_type == BlockTypes.SectionHeader

    line: Line = page.get_block(header.structure[0])
    assert line.block_type == BlockTypes.Line

    span = page.get_block(line.structure[0])
    assert span.block_type == BlockTypes.Span
    assert span.text.strip() == "Subspace Adversarial Training"

    # Ensure we match all text lines up properly: the OCR bboxes must be on
    # the same scale as the layout blocks.
    text_lines = page.contained_blocks(pdf_document, (BlockTypes.Line,))
    text_blocks = page.contained_blocks(
        pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
    )

    # Lines must not extend (beyond a 2% tolerance) past the layout blocks.
    max_line_position = max(ln.polygon.y_end for ln in text_lines)
    max_block_position = max(
        blk.polygon.y_end for blk in text_blocks if blk.source == "layout"
    )
    assert max_line_position <= (max_block_position * 1.02)
@pytest.mark.config({"force_ocr": True, "page_range": [0]})
def test_ocr_pipeline(pdf_document):
    # Forced OCR without LLM assistance.
    _ocr_pipeline_test(pdf_document)
@pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True})
def test_ocr_with_inline_pipeline(pdf_document):
    # Forced OCR with the LLM-assisted (inline math) path enabled.
    _ocr_pipeline_test(pdf_document)
```
--------------------------------------------------------------------------------
/benchmarks/table/gemini.py:
--------------------------------------------------------------------------------
```python
import json
from PIL import Image
from google import genai
from google.genai import types
from io import BytesIO
from pydantic import BaseModel
from marker.settings import settings
# Prompt sent alongside the table image; it constrains the model to a minimal
# <table>/<tr>/<td> vocabulary so outputs are comparable across methods.
prompt = """
You're an expert document analyst who is good at turning tables in documents into HTML. Analyze the provided image, and convert it to a faithful HTML representation.
Guidelines:
- Keep the HTML simple and concise.
- Only include the <table> tag and contents.
- Only use <table>, <tr>, and <td> tags. Only use the colspan and rowspan attributes if necessary. Do not use <tbody>, <thead>, or <th> tags.
- Make sure the table is as faithful to the image as possible with the given tags.
**Instructions**
1. Analyze the image, and determine the table structure.
2. Convert the table image to HTML, following the guidelines above.
3. Output only the HTML for the table, starting with the <table> tag and ending with the </table> tag.
""".strip()


class TableSchema(BaseModel):
    # Structured-output schema: the model must answer {"table_html": "..."}.
    table_html: str
def gemini_table_rec(image: Image.Image):
    """Ask Gemini to transcribe a table image; return the HTML string."""
    client = genai.Client(
        api_key=settings.GOOGLE_API_KEY,
        http_options={"timeout": 60000},
    )

    buffer = BytesIO()
    image.save(buffer, format="PNG")
    image_part = types.Part.from_bytes(data=buffer.getvalue(), mime_type="image/png")

    # According to gemini docs, it performs better if the image is the first element
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[image_part, prompt],
        config={
            "temperature": 0,
            "response_schema": TableSchema,
            "response_mime_type": "application/json",
        },
    )

    raw_text = response.candidates[0].content.parts[0].text
    return json.loads(raw_text)["table_html"]
```
--------------------------------------------------------------------------------
/tests/builders/test_pdf_links.py:
--------------------------------------------------------------------------------
```python
import re
import pytest
from marker.converters.pdf import PdfConverter
from marker.renderers.markdown import MarkdownOutput
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.util import classes_to_strings
@pytest.mark.filename("arxiv_test.pdf")
@pytest.mark.output_format("markdown")
@pytest.mark.config({"disable_ocr": True})
def test_pdf_links(pdf_document: Document, config, renderer, model_dict, temp_doc):
    """PDF-internal links should become markdown anchors that all resolve."""
    first_page = pdf_document.pages[1]
    processors = ["marker.processors.reference.ReferenceProcessor"]
    pdf_converter = PdfConverter(
        artifact_dict=model_dict,
        processor_list=processors,
        renderer=classes_to_strings([renderer])[0],
        config=config,
    )
    # The "II." span should link to the section-header anchor on this page.
    for section_header_span in first_page.contained_blocks(
        pdf_document, (BlockTypes.Span,)
    ):
        if "II." in section_header_span.text:
            assert section_header_span.url == "#page-1-0"
            break
    else:
        raise ValueError("Could not find II. in the first page")
    section_header_block = first_page.contained_blocks(
        pdf_document, (BlockTypes.SectionHeader,)
    )[0]
    assert section_header_block.raw_text(pdf_document) == "II. THEORETICAL FRAMEWORK\n"
    assert first_page.refs[0].ref == "page-1-0"
    markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
    markdown = markdown_output.markdown
    assert "[II.](#page-1-0)" in markdown
    assert '<span id="page-1-0"></span>II. THEORETICAL FRAMEWORK' in markdown
    # Every anchor referenced by a link must exist somewhere in the output.
    for ref in set(
        [
            f'<span id="page-{m[0]}-{m[1]}">'
            for m in re.findall(r"\]\(#page-(\d+)-(\d+)\)", markdown)
        ]
    ):
        assert ref in markdown, f"Reference {ref} not found in markdown"
```
--------------------------------------------------------------------------------
/benchmarks/overall/download/llamaparse.py:
--------------------------------------------------------------------------------
```python
import io
import time
import requests
from benchmarks.overall.download.base import Downloader
class LlamaParseDownloader(Downloader):
    """Downloader that runs benchmark PDFs through the LlamaParse API."""

    service = "llamaparse"

    def get_html(self, pdf_bytes):
        # The API needs a filename; a timestamp keeps uploads distinct.
        rand_name = str(time.time()) + ".pdf"
        start = time.time()
        buff = io.BytesIO(pdf_bytes)
        md = upload_and_parse_file(self.api_key, rand_name, buff)
        elapsed = time.time() - start
        if isinstance(md, bytes):
            md = md.decode("utf-8")
        return {"md": md, "time": elapsed}
def upload_and_parse_file(api_key: str, fname: str, buff, max_retries: int = 180, delay: int = 1):
    """Upload a PDF to LlamaParse and poll until the markdown result is ready.

    Args:
        api_key: LlamaParse bearer token.
        fname: Upload filename (the .pdf suffix drives type detection).
        buff: File-like object containing the PDF bytes.
        max_retries: Maximum number of status polls before giving up.
        delay: Seconds to sleep between polls.

    Returns:
        The parsed markdown string.

    Raises:
        TimeoutError: If the job does not finish within max_retries polls.
        RuntimeError: If the service reports the job failed.
        requests.HTTPError: On any non-2xx API response.
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json"
    }

    # Upload file
    files = {
        'file': (fname, buff, 'application/pdf')
    }
    response = requests.post(
        'https://api.cloud.llamaindex.ai/api/v1/parsing/upload',
        headers=headers,
        files=files,
        timeout=60,  # bug fix: no timeout meant a dead connection hung forever
    )
    response.raise_for_status()
    job_id = response.json()['id']

    # Poll for completion
    for _ in range(max_retries):
        status_response = requests.get(
            f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}',
            headers=headers,
            timeout=30,
        )
        status_response.raise_for_status()
        status = status_response.json()['status']
        if status == 'SUCCESS':
            # Get results
            result_response = requests.get(
                f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}/result/markdown',
                headers=headers,
                timeout=60,
            )
            result_response.raise_for_status()
            return result_response.json()['markdown']
        if status == 'ERROR':
            # Fail fast instead of polling a dead job until the timeout.
            raise RuntimeError(f"LlamaParse job {job_id} failed")
        time.sleep(delay)
    raise TimeoutError("Job did not complete within the maximum retry attempts")
```
--------------------------------------------------------------------------------
/benchmarks/overall/download/mistral.py:
--------------------------------------------------------------------------------
```python
import io
import time
import requests
from benchmarks.overall.download.base import Downloader
class MistralDownloader(Downloader):
    """Downloader that runs benchmark PDFs through Mistral's OCR API."""

    service = "mistral"

    def get_html(self, pdf_bytes):
        # The API needs a filename; a timestamp keeps uploads distinct.
        rand_name = str(time.time()) + ".pdf"
        start = time.time()
        buff = io.BytesIO(pdf_bytes)
        md = upload_and_process_file(self.api_key, rand_name, buff)
        elapsed = time.time() - start
        if isinstance(md, bytes):
            md = md.decode("utf-8")
        return {"md": md, "time": elapsed}
def upload_and_process_file(api_key: str, fname: str, buff):
    """Run a PDF through the Mistral OCR pipeline and return its markdown.

    Steps: upload the file, fetch a signed URL for it, then submit an OCR
    request that points at the signed URL.

    Args:
        api_key: Mistral API bearer token.
        fname: Upload filename.
        buff: File-like object containing the PDF bytes.

    Returns:
        Markdown for the first page of the document.

    Raises:
        requests.HTTPError: On any non-2xx API response.
    """
    headers = {
        "Authorization": f"Bearer {api_key}"
    }

    upload_headers = headers.copy()
    files = {
        'file': (fname, buff, 'application/pdf'),
        'purpose': (None, 'ocr')
    }
    upload_response = requests.post(
        'https://api.mistral.ai/v1/files',
        headers=upload_headers,
        files=files,
        timeout=60,  # bug fix: no timeout meant a dead connection hung forever
    )
    upload_response.raise_for_status()
    file_id = upload_response.json()['id']

    url_headers = headers.copy()
    url_headers["Accept"] = "application/json"
    url_response = requests.get(
        f'https://api.mistral.ai/v1/files/{file_id}/url?expiry=24',
        headers=url_headers,
        timeout=30,
    )
    url_response.raise_for_status()
    signed_url = url_response.json()['url']

    ocr_headers = headers.copy()
    ocr_headers["Content-Type"] = "application/json"
    ocr_data = {
        "model": "mistral-ocr-latest",
        "document": {
            "type": "document_url",
            "document_url": signed_url
        },
        "include_image_base64": True
    }
    ocr_response = requests.post(
        'https://api.mistral.ai/v1/ocr',
        headers=ocr_headers,
        json=ocr_data,
        timeout=120,  # the OCR call itself can take a while
    )
    ocr_response.raise_for_status()
    result = ocr_response.json()
    # The benchmark uses single-page PDFs, so only the first page is relevant.
    return result["pages"][0]["markdown"]
```
--------------------------------------------------------------------------------
/tests/config/test_config.py:
--------------------------------------------------------------------------------
```python
import sys
from contextlib import suppress
import click
from marker.config.printer import CustomClickPrinter
from marker.config.crawler import crawler
from marker.config.parser import ConfigParser
def capture_kwargs(argv):
    """Run the marker CLI option parser on *argv* and return the kwargs click
    would hand to the entry point, without running a conversion."""
    command = click.command(cls=CustomClickPrinter)
    captured_kwargs = {}

    def parse_args(**kwargs):
        # Stand-in for the real entry point: just record what click parsed.
        captured_kwargs.update(kwargs)
        return kwargs

    # click reads sys.argv, so swap it in and always restore it afterwards.
    original_argv = sys.argv
    sys.argv = argv
    try:
        # click exits via SystemExit on --help and parse errors; ignore that.
        with suppress(SystemExit):
            command(ConfigParser.common_options(parse_args))()
    finally:
        sys.argv = original_argv
    return captured_kwargs
def test_config_parser():
    """CLI flags should flow through kwargs into the generated config dict."""
    # Bug fix: previously this test assigned sys.argv directly and then passed
    # sys.argv to capture_kwargs, which saved-and-"restored" the already
    # mutated value — leaking the mutation into later tests. Use a local list;
    # capture_kwargs swaps it into sys.argv itself and restores the original.
    argv = [
        "test",
        "--disable_multiprocessing",
        "--output_dir",
        "output_dir",
        "--height_tolerance",
        "0.5",
    ]
    kwargs = capture_kwargs(argv)
    parser = ConfigParser(kwargs)
    config_dict = parser.generate_config_dict()

    # Validate kwarg capturing
    assert kwargs["disable_multiprocessing"]
    assert kwargs["output_dir"] == "output_dir"
    assert config_dict["pdftext_workers"] == 1  # disabling multiprocessing does this
    assert config_dict["height_tolerance"] == 0.5
def test_config_none():
    """With no CLI flags, every crawled attribute must come back unset."""
    kwargs = capture_kwargs(["test"])
    # We force some options to become flags for ease of use on the CLI
    for key in crawler.attr_set:
        assert kwargs.get(key) is None
def test_config_llm():
    """--use_llm should be reflected in the generated config dict."""
    captured = capture_kwargs(["test", "--use_llm"])
    generated = ConfigParser(captured).generate_config_dict()
    assert generated["use_llm"]
def test_config_force_ocr():
    """--force_ocr should be reflected in the generated config dict."""
    captured = capture_kwargs(["test", "--force_ocr"])
    generated = ConfigParser(captured).generate_config_dict()
    assert generated["force_ocr"]
```
--------------------------------------------------------------------------------
/tests/converters/test_ocr_converter.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.converters.ocr import OCRConverter
from marker.renderers.ocr_json import OCRJSONOutput, OCRJSONPageOutput
def _ocr_converter(config, model_dict, temp_pdf, line_count: int, eq_count: int):
    """Run the OCR converter on temp_pdf and sanity-check the JSON output.

    line_count is currently unused (the strict line-count assertion is
    disabled) but kept so callers still document the expected value.
    """
    converter = OCRConverter(artifact_dict=model_dict, config=config)
    ocr_json: OCRJSONOutput = converter(temp_pdf.name)

    pages = ocr_json.children
    assert len(pages) == 1

    equations = [child for child in pages[0].children if child.block_type == "Equation"]
    assert len(equations) == eq_count
    return pages
def check_bboxes(page: OCRJSONPageOutput, lines):
    """Assert every child bbox of every line lies inside the page bbox."""
    page_bbox = page.bbox
    for line in lines:
        assert len(line.children) > 0
        for child in line.children:
            x0, y0, x1, y1 = child.bbox
            inside = (
                x0 >= page_bbox[0]
                and y0 >= page_bbox[1]
                and x1 <= page_bbox[2]
                and y1 <= page_bbox[3]
            )
            assert inside, "Child bbox is outside page bbox"
@pytest.mark.config({"page_range": [0]})
def test_ocr_converter(config, model_dict, temp_doc):
    # Default document, first page: expect 2 equations.
    _ocr_converter(config, model_dict, temp_doc, 85, 2)
@pytest.mark.filename("pres.pdf")
@pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True})
def test_ocr_converter_force(config, model_dict, temp_doc):
    """Forced OCR with kept chars: line children must stay inside the page."""
    pages = _ocr_converter(config, model_dict, temp_doc, 10, 0)
    first_page = pages[0]
    line_children = [child for child in first_page.children if child.block_type == "Line"]
    check_bboxes(first_page, line_children)
@pytest.mark.filename("pres.pdf")
@pytest.mark.config({"page_range": [1], "keep_chars": True})
def test_ocr_converter_keep(config, model_dict, temp_doc):
    """Non-forced OCR with kept chars: line children must stay inside the page."""
    pages = _ocr_converter(config, model_dict, temp_doc, 10, 0)
    first_page = pages[0]
    line_children = [child for child in first_page.children if child.block_type == "Line"]
    check_bboxes(first_page, line_children)
```
--------------------------------------------------------------------------------
/benchmarks/overall/display/dataset.py:
--------------------------------------------------------------------------------
```python
import json
from typing import List
import datasets
from tqdm import tqdm
from benchmarks.overall.registry import METHOD_REGISTRY
from benchmarks.overall.schema import FullResult
def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_types: List[str], max_rows: int | None = None) -> datasets.Dataset:
    """Assemble a display dataset pairing each sample with every method's
    markdown, rendered image, and scores.

    Args:
        bench_dataset: The benchmark source dataset (uuid/classification/img).
        result: Aggregated benchmark output, keyed by sample index.
        score_types: Score columns to copy per method.
        max_rows: Optional cap on the number of rows emitted.

    Returns:
        A new datasets.Dataset with one row per scored sample.
    """
    # Bug fix: this function referenced PIL.Image without importing PIL, so the
    # render-failure fallback below raised NameError. Local import keeps the
    # dependency next to its single use.
    from PIL import Image

    rows = []
    for idx, sample in tqdm(enumerate(bench_dataset), desc="Building dataset"):
        if idx not in result["markdown"]:
            continue
        if max_rows is not None and idx >= max_rows:
            break

        row = {
            "uuid": sample["uuid"],
            "classification": sample["classification"],
            "language": sample["language"],
            "img": sample["img"],
        }
        for method in result["markdown"][idx]:
            if method == "gt":
                continue
            method_cls = METHOD_REGISTRY[method]()
            md = result["markdown"][idx][method]
            try:
                method_img = method_cls.render(result["markdown"][idx][method])
            except Exception:
                # This can happen when the markdown is None; fall back to a
                # blank placeholder so the dataset schema stays uniform.
                method_img = Image.new("RGB", (200, 200))
            row[f"{method}_md"] = md
            row[f"{method}_img"] = method_img
            for score_type in score_types:
                try:
                    row[f"{method}_{score_type}"] = result["scores"][idx][method][score_type]["score"]
                except KeyError:
                    row[f"{method}_{score_type}"] = -1.0  # Missing score
                try:
                    row[f"{method}_{score_type}_detail"] = json.dumps(result["scores"][idx][method][score_type]["specific_scores"])
                except KeyError:
                    row[f"{method}_{score_type}_detail"] = ""  # Missing detail
        rows.append(row)
    ds = datasets.Dataset.from_list(rows)
    return ds
```
--------------------------------------------------------------------------------
/benchmarks/overall/download/base.py:
--------------------------------------------------------------------------------
```python
import json
from json import JSONDecodeError
from pathlib import Path
import datasets
from tqdm import tqdm
class Downloader:
    """Base class for fetching benchmark results from a hosted parsing API.

    Subclasses set ``service`` and implement ``get_html``; results are cached
    on disk as one JSON file per sample, then pushed to the HF hub.
    """

    # Local directory where per-sample results are cached between runs.
    cache_path: Path = Path("cache")
    # Short service name; used in the uploaded dataset repo id.
    service: str

    def __init__(self, api_key, app_id, max_rows: int = 2200):
        self.cache_path.mkdir(exist_ok=True)
        self.max_rows = max_rows
        self.api_key = api_key
        # app_id is only needed by some services (e.g. Mathpix); others ignore it.
        self.app_id = app_id
        self.ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train")

    def get_html(self, pdf_bytes):
        """Convert one PDF (bytes) and return {"md": ..., "time": ...}."""
        raise NotImplementedError

    def upload_ds(self):
        """Collect all cached JSON results and push them as a private HF dataset."""
        rows = []
        for file in self.cache_path.glob("*.json"):
            with open(file, "r") as f:
                data = json.load(f)
            rows.append(data)
        out_ds = datasets.Dataset.from_list(rows, features=datasets.Features({
            "md": datasets.Value("string"),
            "uuid": datasets.Value("string"),
            "time": datasets.Value("float"),
        }))
        out_ds.push_to_hub(f"datalab-to/marker_benchmark_{self.service}", private=True)

    def generate_data(self):
        """Run the service over the benchmark set, caching one JSON per sample."""
        max_rows = self.max_rows
        for idx, sample in tqdm(enumerate(self.ds), desc=f"Saving {self.service} results"):
            # Skip samples already processed in a previous run.
            cache_file = self.cache_path / f"{idx}.json"
            if cache_file.exists():
                continue
            pdf_bytes = sample["pdf"]  # This is a single page PDF
            try:
                out_data = self.get_html(pdf_bytes)
            except JSONDecodeError as e:
                print(f"Error with sample {idx}: {e}")
                continue
            except Exception as e:
                # Best-effort: log and move on so one bad sample can't stop the run.
                print(f"Error with sample {idx}: {e}")
                continue
            out_data["uuid"] = sample["uuid"]
            with cache_file.open("w") as f:
                json.dump(out_data, f)
            # NOTE(review): the break happens after handling idx == max_rows, so
            # max_rows + 1 samples are processed — confirm that is intended.
            if idx >= max_rows:
                break

    def __call__(self):
        self.generate_data()
        self.upload_ds()
```
--------------------------------------------------------------------------------
/marker/processors/reference.py:
--------------------------------------------------------------------------------
```python
import numpy as np
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Reference
from marker.schema.document import Document
from marker.schema.groups.list import ListGroup
from marker.schema.groups.table import TableGroup
from marker.schema.registry import get_block_class
from marker.schema.groups.figure import FigureGroup
class ReferenceProcessor(BaseProcessor):
    """
    A processor for adding references to the document.
    """

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        """Attach each page ref to the output block whose top-left corner is
        nearest (Euclidean distance) to the ref's coordinate."""
        ReferenceClass: Reference = get_block_class(BlockTypes.Reference)

        for page in document.pages:
            refs = page.refs
            ref_starts = np.array([ref.coord for ref in refs])

            # Flatten grouped blocks one level so refs can attach to members.
            blocks = []
            for block_id in page.structure:
                block = page.get_block(block_id)
                if isinstance(block, (ListGroup, FigureGroup, TableGroup)):
                    blocks.extend([page.get_block(b) for b in block.structure])
                else:
                    blocks.append(block)
            blocks = [b for b in blocks if not b.ignore_for_output]

            block_starts = np.array([block.polygon.bbox[:2] for block in blocks])
            # Nothing to match on this page.
            if not (len(refs) and len(block_starts)):
                continue

            # Pairwise distances: rows index blocks, columns index refs.
            distances = np.linalg.norm(block_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2)
            for ref_idx in range(len(ref_starts)):
                block_idx = np.argmin(distances[:, ref_idx])
                block = blocks[block_idx]

                # Prepend a Reference block so the anchor renders before content.
                ref_block = page.add_full_block(ReferenceClass(
                    ref=refs[ref_idx].ref,
                    polygon=block.polygon,
                    page_id=page.page_id
                ))
                if block.structure is None:
                    block.structure = []
                block.structure.insert(0, ref_block.id)
```
--------------------------------------------------------------------------------
/marker/builders/document.py:
--------------------------------------------------------------------------------
```python
from typing import Annotated
from marker.builders import BaseBuilder
from marker.builders.layout import LayoutBuilder
from marker.builders.line import LineBuilder
from marker.builders.ocr import OcrBuilder
from marker.providers.pdf import PdfProvider
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup
from marker.schema.registry import get_block_class
class DocumentBuilder(BaseBuilder):
    """
    Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder.
    """
    lowres_image_dpi: Annotated[
        int,
        "DPI setting for low-resolution page images used for Layout and Line Detection.",
    ] = 96
    highres_image_dpi: Annotated[
        int,
        "DPI setting for high-resolution page images used for OCR.",
    ] = 192
    disable_ocr: Annotated[
        bool,
        "Disable OCR processing.",
    ] = False

    def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, line_builder: LineBuilder, ocr_builder: OcrBuilder):
        # Build the page skeleton first, then enrich it stage by stage.
        document = self.build_document(provider)
        layout_builder(document, provider)
        line_builder(document, provider)
        if not self.disable_ocr:
            ocr_builder(document, provider)
        return document

    def build_document(self, provider: PdfProvider):
        """Create a Document whose pages carry both low- and high-res renders."""
        PageGroupClass: PageGroup = get_block_class(BlockTypes.Page)
        DocumentClass: Document = get_block_class(BlockTypes.Document)

        lowres = provider.get_images(provider.page_range, self.lowres_image_dpi)
        highres = provider.get_images(provider.page_range, self.highres_image_dpi)

        pages = []
        for i, page_id in enumerate(provider.page_range):
            pages.append(
                PageGroupClass(
                    page_id=page_id,
                    lowres_image=lowres[i],
                    highres_image=highres[i],
                    polygon=provider.get_page_bbox(page_id),
                    refs=provider.get_page_refs(page_id),
                )
            )
        return DocumentClass(filepath=provider.filepath, pages=pages)
```
--------------------------------------------------------------------------------
/marker/services/ollama.py:
--------------------------------------------------------------------------------
```python
import json
from typing import Annotated, List
import PIL
import requests
from marker.logger import get_logger
from pydantic import BaseModel
from marker.schema.blocks import Block
from marker.services import BaseService
logger = get_logger()
class OllamaService(BaseService):
    """BaseService implementation backed by a local Ollama server."""

    ollama_base_url: Annotated[
        str, "The base url to use for ollama. No trailing slash."
    ] = "http://localhost:11434"
    ollama_model: Annotated[str, "The model name to use for ollama."] = (
        "llama3.2-vision"
    )

    def process_images(self, images):
        """Ollama expects plain base64 strings (no data-URI prefix)."""
        image_bytes = [self.img_to_base64(img) for img in images]
        return image_bytes

    def __call__(
        self,
        prompt: str,
        image: PIL.Image.Image | List[PIL.Image.Image] | None,
        block: Block | None,
        response_schema: type[BaseModel],
        max_retries: int | None = None,
        timeout: int | None = None,
    ):
        """Generate a structured response from Ollama.

        Returns the parsed JSON dict on success, or {} on any failure.
        """
        # Bug fix: the timeout argument was previously accepted but never
        # applied, so a hung server blocked forever.
        if timeout is None:
            timeout = self.timeout

        url = f"{self.ollama_base_url}/api/generate"
        headers = {"Content-Type": "application/json"}

        schema = response_schema.model_json_schema()
        format_schema = {
            "type": "object",
            "properties": schema["properties"],
            # Bug fix: "required" is absent from the JSON schema when no fields
            # are required; direct indexing raised an uncaught KeyError here.
            "required": schema.get("required", []),
        }

        image_bytes = self.format_image_for_llm(image)

        payload = {
            "model": self.ollama_model,
            "prompt": prompt,
            "stream": False,
            "format": format_schema,
            "images": image_bytes,
        }

        try:
            response = requests.post(url, json=payload, headers=headers, timeout=timeout)
            response.raise_for_status()
            response_data = response.json()

            total_tokens = (
                response_data["prompt_eval_count"] + response_data["eval_count"]
            )
            if block:
                block.update_metadata(llm_request_count=1, llm_tokens_used=total_tokens)

            data = response_data["response"]
            return json.loads(data)
        except Exception as e:
            logger.warning(f"Ollama inference failed: {e}")
            return {}
```
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
```toml
[tool.poetry]
name = "marker-pdf"
version = "1.10.1"
description = "Convert documents to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
license = "GPL-3.0-or-later"
repository = "https://github.com/VikParuchuri/marker"
keywords = ["pdf", "markdown", "ocr", "nlp"]
packages = [
{include = "marker"}
]
include = [
"marker/scripts/*.sh",
"marker/scripts/*.html",
]
[tool.poetry.dependencies]
python = "^3.10"
Pillow = "^10.1.0"
pydantic = "^2.4.2"
pydantic-settings = "^2.0.3"
transformers = "^4.45.2"
python-dotenv = "^1.0.0"
torch = "^2.7.0"
tqdm = "^4.66.1"
ftfy = "^6.1.1"
rapidfuzz = "^3.8.1"
surya-ocr = "^0.17.0"
regex = "^2024.4.28"
pdftext = "~0.6.3"
markdownify = "^1.1.0"
click = "^8.2.0"
markdown2 = "^2.5.2"
filetype = "^1.2.0"
google-genai = "^1.0.0"
anthropic = "^0.46.0"
pre-commit = "^4.2.0"
scikit-learn = "^1.6.1"
# Optional dependencies for documents
mammoth = {version = "^1.9.0", optional = true}
openpyxl = {version = "^3.1.5", optional = true}
python-pptx = {version = "^1.0.2", optional = true}
ebooklib = {version = "^0.18", optional = true}
weasyprint = {version = "^63.1", optional = true}
openai = "^1.65.2"
[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"
datasets = "^2.21.0"
streamlit = "^1.37.1"
fastapi = "^0.115.4"
uvicorn = "^0.32.0"
python-multipart = "^0.0.16"
pytest = "^8.3.3"
pytest-mock = "^3.14.0"
apted = "1.0.3"
distance = "0.1.3"
lxml = "5.3.0"
tabulate = "^0.9.0"
latex2mathml = "^3.77.0"
playwright = "^1.49.1"
[tool.poetry.extras]
full = ["mammoth", "openpyxl", "python-pptx", "ebooklib", "weasyprint"]
[tool.poetry.scripts]
marker = "marker.scripts.convert:convert_cli"
marker_single = "marker.scripts.convert_single:convert_single_cli"
marker_chunk_convert = "marker.scripts.chunk_convert:chunk_convert_cli"
marker_gui = "marker.scripts.run_streamlit_app:streamlit_app_cli"
marker_extract = "marker.scripts.run_streamlit_app:extraction_app_cli"
marker_server = "marker.scripts.server:server_cli"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
```
--------------------------------------------------------------------------------
/marker/converters/table.py:
--------------------------------------------------------------------------------
```python
from typing import Tuple, List
from marker.builders.document import DocumentBuilder
from marker.builders.line import LineBuilder
from marker.builders.ocr import OcrBuilder
from marker.converters.pdf import PdfConverter
from marker.processors import BaseProcessor
from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
from marker.processors.llm.llm_form import LLMFormProcessor
from marker.processors.llm.llm_table import LLMTableProcessor
from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
from marker.processors.table import TableProcessor
from marker.providers.registry import provider_from_filepath
from marker.schema import BlockTypes
class TableConverter(PdfConverter):
    """PdfConverter variant that keeps only table-like regions on each page."""

    default_processors: Tuple[BaseProcessor, ...] = (
        TableProcessor,
        LLMTableProcessor,
        LLMTableMergeProcessor,
        LLMFormProcessor,
        LLMComplexRegionProcessor,
    )
    converter_block_types: List[BlockTypes] = (
        BlockTypes.Table,
        BlockTypes.Form,
        BlockTypes.TableOfContents,
    )

    def build_document(self, filepath: str):
        """Build a document, prune non-table blocks, then run the processors."""
        provider_cls = provider_from_filepath(filepath)
        layout_builder = self.resolve_dependencies(self.layout_builder_class)
        line_builder = self.resolve_dependencies(LineBuilder)
        ocr_builder = self.resolve_dependencies(OcrBuilder)

        document_builder = DocumentBuilder(self.config)
        # Skip the generic OCR builder pass for this converter.
        document_builder.disable_ocr = True

        provider = provider_cls(filepath, self.config)
        document = document_builder(provider, layout_builder, line_builder, ocr_builder)

        # Drop every page-level block that is not a table-like region.
        for page in document.pages:
            kept = []
            for block_id in page.structure:
                if block_id.block_type in self.converter_block_types:
                    kept.append(block_id)
            page.structure = kept

        for processor in self.processor_list:
            processor(document)

        return document

    def __call__(self, filepath: str):
        document = self.build_document(filepath)
        self.page_count = len(document.pages)
        renderer = self.resolve_dependencies(self.renderer)
        return renderer(document)
```
--------------------------------------------------------------------------------
/marker/schema/blocks/basetable.py:
--------------------------------------------------------------------------------
```python
from typing import List
from marker.schema import BlockTypes
from marker.schema.blocks import Block, BlockOutput
from marker.schema.blocks.tablecell import TableCell
class BaseTable(Block):
    """Shared base for table-like blocks (tables, forms, tables of contents)."""

    block_type: BlockTypes | None = None
    html: str | None = None

    @staticmethod
    def format_cells(
        document, child_blocks, block_config, child_cells: List[TableCell] | None = None
    ):
        """Render TableCell children as an HTML <table>, row-major, cells sorted by column."""
        if child_cells is None:
            child_cells = [
                document.get_block(child.id)
                for child in child_blocks
                if child.id.block_type == BlockTypes.TableCell
            ]

        row_ids = sorted({cell.row_id for cell in child_cells})

        pieces = ["<table><tbody>"]
        for row_id in row_ids:
            pieces.append("<tr>")
            cells_in_row = sorted(
                (cell for cell in child_cells if cell.row_id == row_id),
                key=lambda cell: cell.col_id,
            )
            for cell in cells_in_row:
                pieces.append(
                    cell.assemble_html(document, child_blocks, None, block_config)
                )
            pieces.append("</tr>")
        pieces.append("</tbody></table>")
        return "".join(pieces)

    def assemble_html(
        self,
        document,
        child_blocks: List[BlockOutput],
        parent_structure=None,
        block_config: dict | None = None,
    ):
        """Assemble this block's HTML: references via the base template, then
        either LLM-provided html, formatted cells, or plain paragraph text."""
        # Only pass references through to the parent template, so table
        # cells are not rendered twice.
        reference_children = [
            child
            for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        template = super().assemble_html(
            document, reference_children, parent_structure, block_config
        )

        if self.html:
            # HTML produced by an LLM processor wins.
            return template + self.html

        has_cells = any(
            child.id.block_type == BlockTypes.TableCell for child in child_blocks
        )
        if has_cells:
            # Cells produced by the table processor.
            return template + self.format_cells(document, child_blocks, block_config)

        # Fallback: plain text lines/spans.
        return f"<p>{template}</p>"
```
--------------------------------------------------------------------------------
/benchmarks/overall/download/mathpix.py:
--------------------------------------------------------------------------------
```python
import json
import time
import requests
from benchmarks.overall.download.base import Downloader
class MathpixDownloader(Downloader):
    """Downloader that converts PDFs through the Mathpix v3 PDF API."""

    service = "mathpix"

    def get_html(self, pdf_bytes):
        """Upload *pdf_bytes*, poll until converted, and return md + elapsed time."""
        headers = {
            "app_id": self.app_id,
            "app_key": self.api_key,
        }

        start = time.time()
        pdf_id = mathpix_request(pdf_bytes, headers)
        status = mathpix_status(pdf_id, headers)
        if status in ("processing", "error"):
            md = ""
        else:
            md = mathpix_results(pdf_id, headers)
        elapsed = time.time() - start

        # Results come back as raw bytes; normalize to str.
        if isinstance(md, bytes):
            md = md.decode("utf-8")

        return {"md": md, "time": elapsed}
def mathpix_request(buffer, headers, timeout: int = 60):
    """Upload a PDF buffer to the Mathpix PDF API and return the assigned pdf_id.

    Args:
        buffer: file-like object or bytes with the PDF content.
        headers: dict with Mathpix ``app_id``/``app_key`` credentials.
        timeout: per-request timeout in seconds (new, default 60; requests
            otherwise waits indefinitely).

    Raises:
        requests.HTTPError: on a non-2xx response (previously this surfaced
            as a confusing KeyError on "pdf_id").
    """
    response = requests.post(
        "https://api.mathpix.com/v3/pdf",
        headers=headers,
        data={
            "options_json": json.dumps(
                {
                    "conversion_formats": {
                        "md": True,
                        "html": True,
                    }
                }
            )
        },
        files={"file": buffer},
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()
    return data["pdf_id"]
def mathpix_status(pdf_id, headers, timeout: int = 60):
    """Poll Mathpix until both md and html conversions finish (or error out).

    Bug fix: the original ``while i < max_iters`` loop never incremented ``i``,
    so a conversion that stayed in "processing" polled forever. A bounded
    for-loop guarantees termination after ``max_iters`` attempts.

    Returns:
        "completed" if both formats finished, otherwise "error".
    """
    max_iters = 120
    status = "processing"
    status2 = "processing"
    for _ in range(max_iters):
        time.sleep(1)
        response = requests.get(
            f"https://api.mathpix.com/v3/converter/{pdf_id}",
            headers=headers,
            timeout=timeout,
        )
        status_resp = response.json()
        # Status may not be populated immediately after upload.
        if "conversion_status" not in status_resp:
            continue
        status = status_resp["conversion_status"]["md"]["status"]
        status2 = status_resp["conversion_status"]["html"]["status"]
        if status == "completed" and status2 == "completed":
            break
        elif status == "error" or status2 == "error":
            break
    return "completed" if status == "completed" and status2 == "completed" else "error"
def mathpix_results(pdf_id, headers, ext="md", timeout: int = 60):
    """Fetch the converted output for *pdf_id* in the given format.

    Args:
        ext: output format extension ("md" or "html").
        timeout: per-request timeout in seconds (new, default 60; requests
            otherwise has no timeout and can hang indefinitely).

    Returns:
        Raw response bytes.
    """
    response = requests.get(
        f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}",
        headers=headers,
        timeout=timeout,
    )
    return response.content
```
--------------------------------------------------------------------------------
/tests/builders/test_garbled_pdf.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.builders.document import DocumentBuilder
from marker.builders.line import LineBuilder
from marker.processors.table import TableProcessor
from marker.schema import BlockTypes
@pytest.mark.filename("water_damage.pdf")
def test_garbled_pdf(pdf_document, recognition_model, table_rec_model, detection_model):
    """Garbled table text is only recovered once TableProcessor runs.

    Before processing, the table's children are plain Line blocks; afterwards
    the same structure id resolves to TableCell blocks with OCRed text.
    """
    assert pdf_document.pages[0].structure[0] == "/page/0/Table/0"

    table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0])
    assert table_block.block_type == BlockTypes.Table
    assert table_block.structure[0] == "/page/0/Line/10"

    table_cell = pdf_document.pages[0].get_block(table_block.structure[0])
    assert table_cell.block_type == BlockTypes.Line

    # We don't OCR in the initial pass, only with the TableProcessor
    processor = TableProcessor(recognition_model, table_rec_model, detection_model)
    processor(pdf_document)

    table = pdf_document.pages[0].contained_blocks(pdf_document, (BlockTypes.Table,))[0]
    # Ukrainian text recovered from the garbled source.
    assert "варіант" in table.raw_text(pdf_document)

    # The first structure slot is now a TableCell instead of a Line.
    table_cell = pdf_document.pages[0].get_block(table_block.structure[0])
    assert table_cell.block_type == BlockTypes.TableCell
@pytest.mark.filename("hindi_judgement.pdf")
@pytest.mark.config({"page_range": [2, 3], "disable_ocr": True})
def test_garbled_builder(config, doc_provider, detection_model, ocr_error_model):
    """OCR-error detection flags at least one page of a garbled PDF as bad."""
    builder = DocumentBuilder(config)
    document = builder.build_document(doc_provider)

    line_builder = LineBuilder(detection_model, ocr_error_model, config)
    bad_ocr_results = line_builder.ocr_error_detection(
        document.pages, doc_provider.page_lines
    )

    labels = list(bad_ocr_results.labels)
    assert len(labels) == 2
    assert "bad" in labels
@pytest.mark.filename("adversarial.pdf")
@pytest.mark.config({"page_range": [2, 3], "disable_ocr": True})
def test_nongarbled_builder(config, doc_provider, detection_model, ocr_error_model):
    """OCR-error detection labels every page of a clean PDF as good."""
    builder = DocumentBuilder(config)
    document = builder.build_document(doc_provider)

    line_builder = LineBuilder(detection_model, ocr_error_model, config)
    bad_ocr_results = line_builder.ocr_error_detection(
        document.pages, doc_provider.page_lines
    )

    labels = list(bad_ocr_results.labels)
    assert len(labels) == 2
    assert set(labels) == {"good"}
```
--------------------------------------------------------------------------------
/marker/processors/order.py:
--------------------------------------------------------------------------------
```python
from statistics import mean
from collections import defaultdict
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
class OrderProcessor(BaseProcessor):
    """
    A processor for sorting the blocks in order if needed. This can help when the layout image was sliced.
    """

    block_types = tuple()

    def __call__(self, document: Document):
        for page in document.pages:
            # Skip OCRed pages
            if page.text_extraction_method != "pdftext":
                continue
            # Skip pages without layout slicing
            if not page.layout_sliced:
                continue

            # Map of block id -> sort key derived from span positions.
            # defaultdict(int) means unseen blocks read as 0; note that the
            # `in block_idxs` membership tests below do NOT insert keys.
            block_idxs = defaultdict(int)
            for block_id in page.structure:
                block = document.get_block(block_id)
                spans = block.contained_blocks(document, (BlockTypes.Span, ))
                if len(spans) == 0:
                    continue
                # Avg span position in original PDF
                block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2

            for block_id in page.structure:
                # Already assigned block id via span position
                if block_idxs[block_id] > 0:
                    continue
                block = document.get_block(block_id)
                prev_block = document.get_prev_block(block)
                next_block = document.get_next_block(block)

                # Span-less blocks get a key relative to the nearest
                # neighbor that already has one: walk backwards first,
                # falling back to walking forwards.
                block_idx_add = 0
                if prev_block:
                    block_idx_add = 1

                while prev_block and prev_block.id not in block_idxs:
                    prev_block = document.get_prev_block(prev_block)
                    block_idx_add += 1

                if not prev_block:
                    block_idx_add = -1

                while next_block and next_block.id not in block_idxs:
                    next_block = document.get_next_block(next_block)
                    block_idx_add -= 1

                if not next_block and not prev_block:
                    # No anchored neighbor at all; leave the default key of 0.
                    pass
                elif prev_block:
                    block_idxs[block_id] = block_idxs[prev_block.id] + block_idx_add
                else:
                    block_idxs[block_id] = block_idxs[next_block.id] + block_idx_add

            # Re-order the page structure by the computed keys.
            page.structure = sorted(page.structure, key=lambda x: block_idxs[x])
```
--------------------------------------------------------------------------------
/tests/converters/test_extraction_converter.py:
--------------------------------------------------------------------------------
```python
import json
import pytest
from marker.converters.extraction import ExtractionConverter
from marker.extractors.page import PageExtractionSchema
from marker.extractors.document import DocumentExtractionSchema
from marker.services import BaseService
class MockLLMService(BaseService):
    """Stub LLM service returning canned payloads keyed by response schema."""

    def __call__(self, prompt, image=None, page=None, response_schema=None, **kwargs):
        canned_responses = {
            PageExtractionSchema: {
                "description": "Mock extraction description",
                "detailed_notes": "Mock detailed notes for page extraction",
            },
            DocumentExtractionSchema: {
                "analysis": "Mock document analysis",
                "document_json": json.dumps({"test_key": "test_value"}),
            },
        }
        # Unknown schemas get an empty payload, same as the fall-through.
        return canned_responses.get(response_schema, {})
@pytest.fixture
def mock_llm_service():
    # Hands over the service *class* (not an instance); tests instantiate
    # or assign it wherever a service is expected.
    return MockLLMService
@pytest.fixture
def extraction_converter(config, model_dict, mock_llm_service):
    """Build an ExtractionConverter wired to the mock LLM service."""
    schema_properties = {"test_key": {"title": "Test Key", "type": "string"}}
    schema = {
        "title": "TestSchema",
        "type": "object",
        "properties": schema_properties,
        "required": ["test_key"],
    }

    config["page_schema"] = json.dumps(schema)
    config["output_format"] = "markdown"
    model_dict["llm_service"] = mock_llm_service

    converter = ExtractionConverter(
        artifact_dict=model_dict, processor_list=None, config=config
    )
    converter.llm_service = mock_llm_service
    converter.default_llm_service = MockLLMService
    return converter
@pytest.mark.config({"page_range": [0]})
def test_extraction_converter(config, model_dict, mock_llm_service, temp_doc):
    """Extraction still succeeds when page_schema is not valid JSON.

    NOTE(review): presumably the converter falls back to treating the raw
    string as the schema -- confirm against ExtractionConverter's parsing.
    """
    config["page_schema"] = "invalid json"

    model_dict["llm_service"] = mock_llm_service
    converter = ExtractionConverter(
        artifact_dict=model_dict, processor_list=None, config=config
    )
    # The artifact dict gets an *instance* here, unlike the fixture above.
    converter.artifact_dict["llm_service"] = mock_llm_service()

    results = converter(temp_doc.name)
    assert results.document_json == '{"test_key": "test_value"}'
@pytest.mark.config({"page_range": [0, 1]})
def test_extraction_converter_multiple_pages(extraction_converter, temp_doc):
    """Two-page extraction still produces a single merged document JSON."""
    output = extraction_converter(temp_doc.name)

    assert output is not None
    assert output.document_json is not None

    parsed = json.loads(output.document_json)
    assert parsed == {"test_key": "test_value"}
    assert output.analysis == "Mock document analysis"
--------------------------------------------------------------------------------
/marker/processors/llm/llm_meta.py:
--------------------------------------------------------------------------------
```python
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict, Any
from marker.logger import get_logger
from tqdm import tqdm
from marker.processors.llm import BaseLLMSimpleBlockProcessor, BaseLLMProcessor
from marker.schema.document import Document
from marker.services import BaseService
logger = get_logger()
class LLMSimpleBlockMetaProcessor(BaseLLMProcessor):
    """
    A wrapper for simple LLM processors, so they can all run in parallel.
    """

    def __init__(
        self,
        processor_lst: List[BaseLLMSimpleBlockProcessor],
        llm_service: BaseService,
        config=None,
    ):
        super().__init__(llm_service, config)
        self.processors = processor_lst

    def __call__(self, document: Document):
        # No-op unless LLM usage is enabled and a service is configured.
        if not self.use_llm or self.llm_service is None:
            return

        total = sum(
            [len(processor.inference_blocks(document)) for processor in self.processors]
        )
        pbar = tqdm(
            desc="LLM processors running", disable=self.disable_tqdm, total=total
        )

        # One prompt list per processor, index-aligned with self.processors.
        all_prompts = [
            processor.block_prompts(document) for processor in self.processors
        ]
        pending = []
        futures_map = {}
        with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
            # Fan out every prompt across the pool, remembering which
            # processor each future belongs to.
            for i, prompt_lst in enumerate(all_prompts):
                for prompt in prompt_lst:
                    future = executor.submit(self.get_response, prompt)
                    pending.append(future)
                    futures_map[future] = {"processor_idx": i, "prompt_data": prompt}

            # Collect in submission order; failures are logged, not raised,
            # so one bad response does not abort the whole batch.
            for future in pending:
                try:
                    result = future.result()
                    future_data = futures_map.pop(future)
                    processor: BaseLLMSimpleBlockProcessor = self.processors[
                        future_data["processor_idx"]
                    ]
                    # finalize the result
                    processor(result, future_data["prompt_data"], document)
                except Exception as e:
                    logger.warning(f"Error processing LLM response: {e}")
                pbar.update(1)

        pbar.close()

    def get_response(self, prompt_data: Dict[str, Any]):
        # Thin adapter: unpack the prompt dict into the service call.
        return self.llm_service(
            prompt_data["prompt"],
            prompt_data["image"],
            prompt_data["block"],
            prompt_data["schema"],
        )
```
--------------------------------------------------------------------------------
/marker/converters/__init__.py:
--------------------------------------------------------------------------------
```python
import inspect
from typing import Optional, List, Type
from pydantic import BaseModel
from marker.processors import BaseProcessor
from marker.processors.llm import BaseLLMSimpleBlockProcessor
from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
from marker.util import assign_config, download_font
class BaseConverter:
    """Base class for converters.

    Applies config, resolves constructor dependencies for builders/processors,
    and groups simple LLM processors into one meta-processor so they can run
    concurrently.
    """

    def __init__(self, config: Optional[BaseModel | dict] = None):
        assign_config(self, config)
        self.config = config
        self.llm_service = None

        # Download render font, needed for some providers
        download_font()

    def __call__(self, *args, **kwargs):
        raise NotImplementedError

    def resolve_dependencies(self, cls):
        """Instantiate *cls*, filling each constructor parameter in order from:
        the converter config ('config'), self.artifact_dict (by parameter
        name), then the parameter's declared default.

        Raises:
            ValueError: if a parameter cannot be satisfied by any source.
        """
        init_signature = inspect.signature(cls.__init__)
        parameters = init_signature.parameters

        resolved_kwargs = {}
        for param_name, param in parameters.items():
            if param_name == 'self':
                continue
            elif param_name == 'config':
                resolved_kwargs[param_name] = self.config
            # Consistency fix: was `param.name` (same value as param_name).
            elif param_name in self.artifact_dict:
                resolved_kwargs[param_name] = self.artifact_dict[param_name]
            # Identity check against the sentinel is the idiomatic comparison.
            elif param.default is not inspect.Parameter.empty:
                resolved_kwargs[param_name] = param.default
            else:
                raise ValueError(f"Cannot resolve dependency for parameter: {param_name}")

        return cls(**resolved_kwargs)

    def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) -> List[BaseProcessor]:
        """Instantiate processor classes; wrap all simple LLM processors into a
        single LLMSimpleBlockMetaProcessor (for parallel execution), inserted
        roughly where the first of them originally appeared."""
        processors = []
        for processor_cls in processor_cls_lst:
            processors.append(self.resolve_dependencies(processor_cls))

        simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
        other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)]

        if not simple_llm_processors:
            return processors

        llm_positions = [i for i, p in enumerate(processors) if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
        # Position of the last simple LLM processor, shifted back by the
        # number of removed entries, clamped to the list start.
        insert_position = max(0, llm_positions[-1] - len(simple_llm_processors) + 1)

        meta_processor = LLMSimpleBlockMetaProcessor(
            processor_lst=simple_llm_processors,
            llm_service=self.llm_service,
            config=self.config,
        )
        other_processors.insert(insert_position, meta_processor)
        return other_processors
```
--------------------------------------------------------------------------------
/marker/providers/registry.py:
--------------------------------------------------------------------------------
```python
import filetype
import filetype.match as file_match
from bs4 import BeautifulSoup
from filetype.types import archive, document, IMAGE
from marker.providers.document import DocumentProvider
from marker.providers.epub import EpubProvider
from marker.providers.html import HTMLProvider
from marker.providers.image import ImageProvider
from marker.providers.pdf import PdfProvider
from marker.providers.powerpoint import PowerPointProvider
from marker.providers.spreadsheet import SpreadSheetProvider
# Coarse doctype key -> filetype matcher classes used to sniff file content.
# "image" maps to the full list of image matchers shipped with `filetype`.
DOCTYPE_MATCHERS = {
    "image": IMAGE,
    "pdf": [
        archive.Pdf,
    ],
    "epub": [
        archive.Epub,
    ],
    "doc": [document.Docx],
    "xls": [document.Xlsx],
    "ppt": [document.Pptx],
}
def load_matchers(doctype: str):
    """Instantiate the filetype matcher classes registered for *doctype*."""
    return [matcher_cls() for matcher_cls in DOCTYPE_MATCHERS[doctype]]
def load_extensions(doctype: str):
    """Return the file extensions of the matcher classes for *doctype*."""
    return [matcher_cls.EXTENSION for matcher_cls in DOCTYPE_MATCHERS[doctype]]
def provider_from_ext(filepath: str):
    """Pick a provider class from the filename extension alone.

    Fix: the extension is now lowercased before comparison -- the matcher
    extension tables are lowercase, so uppercase names like "SCAN.PNG"
    previously fell through to the PDF fallback.

    Falls back to PdfProvider when the extension is missing or unknown.
    """
    ext = filepath.rsplit(".", 1)[-1].strip().lower()
    if not ext:
        return PdfProvider

    if ext in load_extensions("image"):
        return ImageProvider
    if ext in load_extensions("pdf"):
        return PdfProvider
    if ext in load_extensions("doc"):
        return DocumentProvider
    if ext in load_extensions("xls"):
        return SpreadSheetProvider
    if ext in load_extensions("ppt"):
        return PowerPointProvider
    if ext in load_extensions("epub"):
        return EpubProvider
    if ext in ["html"]:
        return HTMLProvider
    return PdfProvider
def provider_from_filepath(filepath: str):
    """Pick a provider class by sniffing file *content* (magic bytes first,
    then an HTML heuristic), falling back to the extension-based lookup."""
    if filetype.image_match(filepath) is not None:
        return ImageProvider
    if file_match(filepath, load_matchers("pdf")) is not None:
        return PdfProvider
    if file_match(filepath, load_matchers("epub")) is not None:
        return EpubProvider
    if file_match(filepath, load_matchers("doc")) is not None:
        return DocumentProvider
    if file_match(filepath, load_matchers("xls")) is not None:
        return SpreadSheetProvider
    if file_match(filepath, load_matchers("ppt")) is not None:
        return PowerPointProvider

    # HTML has no magic bytes: parse as text and look for any tag at all.
    # Binary or non-UTF-8 files simply fail the decode and fall through.
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f.read(), "html.parser")
            # Check if there are any HTML tags
            if bool(soup.find()):
                return HTMLProvider
    except Exception:
        pass

    # Fallback if we incorrectly detect the file type
    return provider_from_ext(filepath)
```
--------------------------------------------------------------------------------
/marker/processors/blank_page.py:
--------------------------------------------------------------------------------
```python
from typing import Annotated
from PIL import Image
import numpy as np
import cv2
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.schema.document import Document
from marker.logger import get_logger
logger = get_logger()
class BlankPageProcessor(BaseProcessor):
    """
    A processor to filter out blank pages detected as a single layout block
    """

    # Minimum page/block intersection for the block to count as "full page".
    full_page_block_intersection_threshold: Annotated[
        float, "Threshold to detect blank pages at"
    ] = 0.8
    # Feature flag: the whole processor is a no-op unless enabled.
    filter_blank_pages: Annotated[bool, "Remove blank pages detected as images."] = (
        False
    )

    def is_blank(self, image: Image.Image):
        # Returns True when the rendered block contains no foreground pixels
        # after thresholding and dilation.
        image = np.asarray(image)
        if image.size == 0 or image.shape[0] == 0 or image.shape[1] == 0:
            # Handle empty image case
            return True

        # NOTE(review): assumes a 3-channel RGB input; a grayscale or RGBA
        # render would make cvtColor raise -- confirm upstream render format.
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        gray = cv2.GaussianBlur(gray, (7, 7), 0)

        # Adaptive threshold (inverse for text as white)
        binarized = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15
        )

        num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
            binarized, connectivity=8
        )
        cleaned = np.zeros_like(binarized)
        # NOTE(review): this loop copies *every* non-background component, so
        # `cleaned` ends up equal to `binarized` and `stats` is unused --
        # possibly a component-size filter was intended here; confirm.
        for i in range(1, num_labels):  # skip background
            cleaned[labels == i] = 255

        kernel = np.ones((1, 5), np.uint8)
        dilated = cv2.dilate(cleaned, kernel, iterations=3)

        b = dilated / 255
        return b.sum() == 0

    def __call__(self, document: Document):
        if not self.filter_blank_pages:
            return

        for page in document.pages:
            # Only consider pages whose layout is a single block.
            structure_blocks = page.structure_blocks(document)
            if not structure_blocks or len(structure_blocks) > 1:
                continue

            full_page_block: Block = structure_blocks[0]
            # NOTE(review): assumes intersection_area returns a normalized
            # ratio comparable to the 0.8 threshold -- confirm its units.
            conditions = [
                full_page_block.block_type in [BlockTypes.Picture, BlockTypes.Figure],
                self.is_blank(full_page_block.get_image(document)),
                page.polygon.intersection_area(full_page_block.polygon)
                > self.full_page_block_intersection_threshold,
            ]

            if all(conditions):
                logger.debug(f"Removing blank block {full_page_block.id}")
                page.remove_structure_items([full_page_block.id])
                full_page_block.removed = True
```
--------------------------------------------------------------------------------
/marker/processors/util.py:
--------------------------------------------------------------------------------
```python
import re
from bs4 import BeautifulSoup
from marker.schema import BlockTypes
from marker.schema.groups import PageGroup
from marker.schema.registry import get_block_class
from marker.schema.text import Line
def escape_latex_commands(text: str):
    """Replace literal newline/tab/carriage-return characters with their
    two-character escape sequences (\\n, \\t, \\r)."""
    escape_table = str.maketrans({"\n": "\\n", "\t": "\\t", "\r": "\\r"})
    return text.translate(escape_table)
def add_math_spans_to_line(corrected_text: str, text_line: Line, page: PageGroup):
    """Parse corrected HTML/math text into spans and append them to *text_line*."""
    SpanClass = get_block_class(BlockTypes.Span)
    parsed_spans = text_to_spans(corrected_text)

    # The final span carries the line's trailing newline.
    if parsed_spans:
        parsed_spans[-1]['content'] += "\n"

    for span in parsed_spans:
        span_block = page.add_full_block(
            SpanClass(
                polygon=text_line.polygon,
                text=span['content'],
                font='Unknown',
                font_weight=0,
                font_size=0,
                minimum_position=0,
                maximum_position=0,
                formats=[span['type']],
                url=span.get('url'),
                page_id=text_line.page_id,
                text_extraction_method="gemini",
                has_superscript=span["has_superscript"],
                has_subscript=span["has_subscript"]
            )
        )
        text_line.structure.append(span_block.id)
def text_to_spans(text):
    """Convert an HTML fragment into a flat list of span dicts.

    Only top-level nodes of the fragment are considered. Recognized tags map
    to format types; math content gets control characters escaped; any other
    bare text becomes a plain span.
    """
    soup = BeautifulSoup(text, 'html.parser')

    tag_types = {
        'b': 'bold',
        'i': 'italic',
        'math': 'math',
        'sub': 'plain',
        'sup': 'plain',
        'span': 'plain'
    }
    spans = []
    for node in soup.descendants:
        # Keep only direct children of the fragment root (exactly one parent).
        if len(list(node.parents)) != 1:
            continue

        # NavigableStrings have no attrs; guard before reading href.
        url = node.attrs.get('href') if hasattr(node, 'attrs') else None
        node_name = node.name

        if node_name in tag_types:
            content = node.get_text()
            if node_name == "math":
                content = escape_latex_commands(content)
            spans.append({
                'type': tag_types[node_name],
                'content': content,
                'url': url,
                "has_superscript": node_name == "sup",
                "has_subscript": node_name == "sub"
            })
        elif node.string:
            spans.append({
                'type': 'plain',
                'content': node.string,
                'url': url,
                "has_superscript": False,
                "has_subscript": False
            })

    return spans
```
--------------------------------------------------------------------------------
/marker/providers/__init__.py:
--------------------------------------------------------------------------------
```python
from copy import deepcopy
from typing import List, Optional, Dict
from PIL import Image
from pydantic import BaseModel
from pdftext.schema import Reference
from marker.logger import configure_logging
from marker.schema.polygon import PolygonBox
from marker.schema.text import Span
from marker.schema.text.char import Char
from marker.schema.text.line import Line
from marker.settings import settings
from marker.util import assign_config
configure_logging()
class ProviderOutput(BaseModel):
    """One provider line together with its spans and optional per-span chars."""

    line: Line
    spans: List[Span]
    chars: Optional[List[List[Char]]] = None

    @property
    def raw_text(self):
        # Concatenation of every span's text, in order.
        pieces = [span.text for span in self.spans]
        return "".join(pieces)

    def __hash__(self):
        return hash(tuple(self.line.polygon.bbox))

    def merge(self, other: "ProviderOutput"):
        """Return a new output combining self and other; neither input is mutated."""
        merged = deepcopy(self)
        donor = deepcopy(other)

        merged.spans.extend(donor.spans)

        # Chars are optional: extend when both sides have them, adopt the
        # donor's when only it does.
        if merged.chars is not None and donor.chars is not None:
            merged.chars.extend(donor.chars)
        elif donor.chars is not None:
            merged.chars = donor.chars

        merged.line.polygon = merged.line.polygon.merge([donor.line.polygon])
        return merged
# Mapping of page index -> the provider outputs (lines) found on that page.
ProviderPageLines = Dict[int, List[ProviderOutput]]
class BaseProvider:
    """Base interface for document providers.

    The page-access methods below are stubs that return None; concrete
    providers (PDF, image, HTML, ...) override them.
    """

    def __init__(self, filepath: str, config: Optional[BaseModel | dict] = None):
        assign_config(self, config)
        self.filepath = filepath

    def __len__(self):
        # Stub: number of pages; implemented by subclasses.
        pass

    def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
        # Stub: rendered page images at the requested DPI.
        pass

    def get_page_bbox(self, idx: int) -> PolygonBox | None:
        # Stub: bounding box of page *idx*.
        pass

    def get_page_lines(self, idx: int) -> List[Line]:
        # Stub: extracted text lines of page *idx*.
        pass

    def get_page_refs(self, idx: int) -> List[Reference]:
        # Stub: link/reference targets on page *idx*.
        pass

    def __enter__(self):
        # NOTE(review): no matching __exit__ is defined here -- subclasses
        # presumably provide cleanup; confirm before using as a context manager.
        return self

    @staticmethod
    def get_font_css():
        # Imported lazily: weasyprint is only needed by HTML-rendering providers.
        from weasyprint import CSS
        from weasyprint.text.fonts import FontConfiguration

        font_config = FontConfiguration()
        # Register the downloaded fallback font and disable ligatures so
        # rendered text maps cleanly back to source characters.
        css = CSS(
            string=f"""
            @font-face {{
                font-family: GoNotoCurrent-Regular;
                src: url({settings.FONT_PATH});
                font-display: swap;
            }}
            body {{
                font-family: {settings.FONT_NAME.split(".")[0]}, sans-serif;
                font-variant-ligatures: none;
                font-feature-settings: "liga" 0;
                text-rendering: optimizeLegibility;
            }}
            """,
            font_config=font_config,
        )
        return css
```
--------------------------------------------------------------------------------
/marker/providers/document.py:
--------------------------------------------------------------------------------
```python
import base64
import os
import re
import tempfile
from io import BytesIO
from PIL import Image
from marker.logger import get_logger
from marker.providers.pdf import PdfProvider
logger = get_logger()
css = """
@page {
size: A4;
margin: 2cm;
}
img {
max-width: 100%;
max-height: 25cm;
object-fit: contain;
margin: 12pt auto;
}
div, p {
max-width: 100%;
word-break: break-word;
font-size: 10pt;
}
table {
width: 100%;
border-collapse: collapse;
break-inside: auto;
font-size: 10pt;
}
tr {
break-inside: avoid;
page-break-inside: avoid;
}
td {
border: 0.75pt solid #000;
padding: 6pt;
}
"""
class DocumentProvider(PdfProvider):
    """Provider for DOCX files: converts the document to a temporary PDF
    (via mammoth -> HTML -> weasyprint) and then delegates to PdfProvider."""

    def __init__(self, filepath: str, config=None):
        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        self.temp_pdf_path = temp_pdf.name
        temp_pdf.close()

        # Convert DOCX to PDF
        try:
            self.convert_docx_to_pdf(filepath)
        except Exception as e:
            raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}")

        # Initialize the PDF provider with the temp pdf path
        super().__init__(self.temp_pdf_path, config)

    def __del__(self):
        # Robustness fix: __del__ can run on a partially-initialized instance
        # or during interpreter shutdown, so guard the attribute access and
        # never let a filesystem error propagate out of a finalizer.
        path = getattr(self, "temp_pdf_path", None)
        if path:
            try:
                if os.path.exists(path):
                    os.remove(path)
            except OSError:
                pass

    def convert_docx_to_pdf(self, filepath: str):
        """Render the DOCX at *filepath* into self.temp_pdf_path."""
        from weasyprint import CSS, HTML
        import mammoth

        with open(filepath, "rb") as docx_file:
            # we convert the docx to HTML
            result = mammoth.convert_to_html(docx_file)
            html = result.value

            # We convert the HTML into a PDF
            HTML(string=self._preprocess_base64_images(html)).write_pdf(
                self.temp_pdf_path, stylesheets=[CSS(string=css), self.get_font_css()]
            )

    @staticmethod
    def _preprocess_base64_images(html_content):
        """Re-encode every inline base64 image; drop ones that fail to decode.

        Round-tripping through PIL normalizes malformed payloads that would
        otherwise break PDF creation downstream.
        """
        pattern = r'data:([^;]+);base64,([^"\'>\s]+)'

        def convert_image(match):
            try:
                img_data = base64.b64decode(match.group(2))
                with BytesIO(img_data) as bio:
                    with Image.open(bio) as img:
                        output = BytesIO()
                        img.save(output, format=img.format)
                        new_base64 = base64.b64encode(output.getvalue()).decode()
                        return f"data:{match.group(1)};base64,{new_base64}"
            except Exception as e:
                logger.error(f"Failed to process image: {e}")
                return ""  # we ditch broken images as that breaks the PDF creation down the line

        return re.sub(pattern, convert_image, html_content)
```
--------------------------------------------------------------------------------
/benchmarks/overall/methods/olmocr.py:
--------------------------------------------------------------------------------
```python
import base64
import json
import tempfile
import time
from io import BytesIO
import torch
from PIL import Image
from benchmarks.overall.methods import BaseMethod, BenchmarkResult
def convert_single_page(filename: str, model, processor, device):
    """Run olmOCR on page 1 of *filename* and return the extracted text.

    Renders the page to a base64 PNG, builds the olmOCR finetuning prompt
    from the PDF's anchor text, samples one generation from the model, and
    parses the JSON "natural_text" field out of the decoded output.
    """
    from olmocr.data.renderpdf import render_pdf_to_base64png
    from olmocr.prompts import build_finetuning_prompt
    from olmocr.prompts.anchor import get_anchor_text

    # Page 1 only, longest side capped at 1024 px.
    image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1024)

    # Build the prompt, using document metadata
    anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    # Build the full prompt
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]

    # Apply the chat template and processor
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

    inputs = processor(
        text=[text],
        images=[main_image],
        padding=True,
        return_tensors="pt",
    )
    inputs = {key: value.to(device) for (key, value) in inputs.items()}

    # Generate the output (sampled, temperature 0.8 -- output is stochastic).
    output = model.generate(
        **inputs,
        temperature=0.8,
        max_new_tokens=8192,
        num_return_sequences=1,
        do_sample=True,
    )

    # Decode the output: strip the prompt tokens, keep only the generation.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_length:]
    text_output = processor.tokenizer.batch_decode(
        new_tokens, skip_special_tokens=True
    )[0]

    # Prefer well-formed JSON; fall back to a crude split when the model
    # emits malformed JSON; give up with an empty string otherwise.
    try:
        text_output = json.loads(text_output)
        text = text_output["natural_text"]
    except Exception:
        try:
            text = text_output.split("natural_text")[1].strip()
        except Exception:
            text = ""

    return text
class OlmOCRMethod(BaseMethod):
    """Benchmark method that runs olmOCR over a single-page PDF sample."""

    olmocr_model: dict = None  # expects keys "model" and "processor"
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        pdf_bytes = sample["pdf"]  # This is a single page PDF

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            # Bug fix: flush buffered bytes to disk before convert_single_page
            # re-opens the file by name -- otherwise the renderer can see a
            # truncated (or empty) PDF.
            f.flush()
            start = time.time()
            result = convert_single_page(
                f.name,
                self.olmocr_model["model"],
                self.olmocr_model["processor"],
                self.olmocr_model["model"].device,
            )
            total = time.time() - start

            return {
                "markdown": result,
                "time": total
            }
```
--------------------------------------------------------------------------------
/tests/renderers/test_markdown_renderer.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.renderers.markdown import MarkdownRenderer
from marker.schema import BlockTypes
from marker.schema.blocks import TableCell
@pytest.mark.config({"page_range": [0], "disable_ocr": True})
def test_markdown_renderer(pdf_document):
    """The page-0 title renders as a markdown H1 when OCR is disabled."""
    renderer = MarkdownRenderer()
    md = renderer(pdf_document).markdown

    # Verify markdown
    assert "# Subspace Adversarial Training" in md
@pytest.mark.config({"page_range": [0]})
def test_markdown_renderer_auto_ocr(pdf_document):
    """With auto OCR, the title text survives (heading level not asserted)."""
    renderer = MarkdownRenderer()
    md = renderer(pdf_document).markdown

    # Verify markdown
    assert "Subspace Adversarial Training" in md
@pytest.mark.config({"page_range": [0, 1], "paginate_output": True})
def test_markdown_renderer_pagination(pdf_document):
    """Paginated output inserts a "{page_number}-..." separator per page."""
    renderer = MarkdownRenderer({"paginate_output": True})
    md = renderer(pdf_document).markdown

    assert "\n\n{0}-" in md
    assert "\n\n{1}-" in md
@pytest.mark.config({"page_range": [0, 1], "paginate_output": True})
def test_markdown_renderer_pagination_blank_last_page(pdf_document):
    """A blank final page still gets its pagination marker and trailing newlines."""
    # Clear all children and structure from the last page to simulate a blank page
    last_page = pdf_document.pages[-1]
    last_page.children = []
    last_page.structure = []

    renderer = MarkdownRenderer({"paginate_output": True})
    md = renderer(pdf_document).markdown

    # Should end with pagination marker and preserve trailing newlines
    assert md.endswith("}\n\n") or md.endswith(
        "}------------------------------------------------\n\n"
    )
@pytest.mark.config({"page_range": [0, 1]})
def test_markdown_renderer_metadata(pdf_document):
    """Renderer metadata exposes a table of contents."""
    renderer = MarkdownRenderer({"paginate_output": True})
    metadata = renderer(pdf_document).metadata
    assert "table_of_contents" in metadata
@pytest.mark.config({"page_range": [0, 1]})
def test_markdown_renderer_images(pdf_document):
    """With extract_images disabled, no images and no image markdown appear."""
    renderer = MarkdownRenderer({"extract_images": False})
    markdown_output = renderer(pdf_document)

    assert len(markdown_output.images) == 0
    # NOTE(review): the original second assertion was truncated in this copy
    # (it ended at `assert "`); reconstructed as checking that no markdown
    # image syntax is emitted -- confirm against the repository version.
    assert "![](" not in markdown_output.markdown
def test_markdown_renderer_tables(pdf_document):
    """Inline formatting inside a table cell survives markdown rendering.

    NOTE(review): no @pytest.mark decorator is visible here -- the line just
    above this test was truncated in this copy, so a config marker may have
    been lost; confirm against the repository version.
    """
    # Replace the first table's contents with a single synthetic cell that
    # mixes <i>, <br> and <math> formatting.
    table = pdf_document.contained_blocks((BlockTypes.Table,))[0]
    page = pdf_document.pages[0]

    cell = TableCell(
        polygon=table.polygon,
        text_lines=["54<i>.45</i>67<br>89<math>x</math>"],
        rowspan=1,
        colspan=1,
        row_id=0,
        col_id=0,
        is_header=False,
        page_id=page.page_id,
    )
    page.add_full_block(cell)
    table.structure = []
    table.add_structure(cell)

    renderer = MarkdownRenderer()
    md = renderer(pdf_document).markdown

    # Italic survives, <br> is kept, and math is converted to $...$.
    assert "54 <i>.45</i> 67<br>89 $x$" in md
```
--------------------------------------------------------------------------------
/marker/schema/registry.py:
--------------------------------------------------------------------------------
```python
from typing import Dict, Type
from importlib import import_module
from marker.schema import BlockTypes
from marker.schema.blocks import (
Block,
Caption,
Code,
Equation,
Figure,
Footnote,
Form,
Handwriting,
InlineMath,
ListItem,
PageFooter,
PageHeader,
Picture,
SectionHeader,
Table,
TableOfContents,
Text,
ComplexRegion,
TableCell,
Reference,
)
from marker.schema.document import Document
from marker.schema.groups import (
FigureGroup,
ListGroup,
PageGroup,
PictureGroup,
TableGroup,
)
from marker.schema.text import Line, Span
from marker.schema.text.char import Char
# Registry mapping each BlockTypes member to the dotted import path of its
# implementing class. Paths (not classes) are stored so lookup can lazily
# import the module.
BLOCK_REGISTRY: Dict[BlockTypes, str] = {}


def register_block_class(block_type: BlockTypes, block_cls: Type[Block]):
    # Store the fully-qualified "module.ClassName" path for later import.
    BLOCK_REGISTRY[block_type] = f"{block_cls.__module__}.{block_cls.__name__}"
def get_block_class(block_type: BlockTypes) -> Type[Block]:
    """Resolve the registered class for *block_type*, importing its module lazily."""
    dotted_path = BLOCK_REGISTRY[block_type]
    module_name, _, class_name = dotted_path.rpartition(".")
    module = import_module(module_name)
    return getattr(module, class_name)
# Register every block type with its concrete class.  Note the two
# non-obvious pairings: BlockTypes.TextInlineMath -> InlineMath and
# BlockTypes.Page -> PageGroup.
for _block_type, _block_cls in (
    (BlockTypes.Line, Line),
    (BlockTypes.Span, Span),
    (BlockTypes.Char, Char),
    (BlockTypes.FigureGroup, FigureGroup),
    (BlockTypes.TableGroup, TableGroup),
    (BlockTypes.ListGroup, ListGroup),
    (BlockTypes.PictureGroup, PictureGroup),
    (BlockTypes.Page, PageGroup),
    (BlockTypes.Caption, Caption),
    (BlockTypes.Code, Code),
    (BlockTypes.Figure, Figure),
    (BlockTypes.Footnote, Footnote),
    (BlockTypes.Form, Form),
    (BlockTypes.Equation, Equation),
    (BlockTypes.Handwriting, Handwriting),
    (BlockTypes.TextInlineMath, InlineMath),
    (BlockTypes.ListItem, ListItem),
    (BlockTypes.PageFooter, PageFooter),
    (BlockTypes.PageHeader, PageHeader),
    (BlockTypes.Picture, Picture),
    (BlockTypes.SectionHeader, SectionHeader),
    (BlockTypes.Table, Table),
    (BlockTypes.Text, Text),
    (BlockTypes.TableOfContents, TableOfContents),
    (BlockTypes.ComplexRegion, ComplexRegion),
    (BlockTypes.TableCell, TableCell),
    (BlockTypes.Reference, Reference),
    (BlockTypes.Document, Document),
):
    register_block_class(_block_type, _block_cls)

# Import-time sanity checks: the registry covers every BlockTypes member,
# and each registered class declares the matching block_type default.
assert len(BLOCK_REGISTRY) == len(BlockTypes)
assert all(
    get_block_class(block_type).model_fields["block_type"].default == block_type
    for block_type in BLOCK_REGISTRY
)
```
--------------------------------------------------------------------------------
/marker/processors/blockquote.py:
--------------------------------------------------------------------------------
```python
from typing import Annotated, Tuple
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
class BlockquoteProcessor(BaseProcessor):
    """
    A processor for tagging blockquotes.

    Walks consecutive text blocks on each page and marks a block as a
    blockquote when it is indented below an existing blockquote (nesting one
    level deeper) or stays horizontally aligned with one (inheriting its
    level).
    """
    block_types: Annotated[
        Tuple[BlockTypes],
        "The block types to process.",
    ] = (BlockTypes.Text, BlockTypes.TextInlineMath)
    min_x_indent: Annotated[
        float,
        "The minimum horizontal indentation required to consider a block as part of a blockquote.",
        "Expressed as a percentage of the block width.",
    ] = 0.1
    x_start_tolerance: Annotated[
        float,
        "The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.",
        "Expressed as a percentage of the block width.",
    ] = 0.01
    x_end_tolerance: Annotated[
        float,
        "The maximum allowable difference between the ending x-coordinates of consecutive blocks to consider them aligned.",
        "Expressed as a percentage of the block width.",
    ] = 0.01

    # NOTE: the redundant pass-through __init__ (which only called
    # super().__init__(config)) was removed; BaseProcessor.__init__ is
    # inherited unchanged.

    def __call__(self, document: Document):
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                # Only blocks with at least two structure children qualify.
                if block.structure is None:
                    continue
                if len(block.structure) < 2:
                    continue
                next_block = page.get_next_block(block)
                if next_block is None:
                    continue
                if next_block.block_type not in self.block_types:
                    continue
                if next_block.structure is None:
                    continue
                if next_block.ignore_for_output:
                    continue

                # Geometric relations between this block and the next one,
                # with tolerances scaled by the current block's width.
                matching_x_end = abs(next_block.polygon.x_end - block.polygon.x_end) < self.x_end_tolerance * block.polygon.width
                matching_x_start = abs(next_block.polygon.x_start - block.polygon.x_start) < self.x_start_tolerance * block.polygon.width
                x_indent = next_block.polygon.x_start > block.polygon.x_start + (self.min_x_indent * block.polygon.width)
                y_indent = next_block.polygon.y_start > block.polygon.y_end

                if block.blockquote:
                    # Continue the blockquote when aligned with the current
                    # block, or nest one level deeper when indented below it.
                    next_block.blockquote = (matching_x_end and matching_x_start) or (x_indent and y_indent)
                    next_block.blockquote_level = block.blockquote_level
                    if (x_indent and y_indent):
                        next_block.blockquote_level += 1
                elif len(next_block.structure) >= 2 and (x_indent and y_indent):
                    # Start a new blockquote when a multi-line block is
                    # indented below a non-blockquote block.
                    next_block.blockquote = True
                    next_block.blockquote_level = 1
```
--------------------------------------------------------------------------------
/tests/services/test_service_init.py:
--------------------------------------------------------------------------------
```python
import pytest
from marker.converters.pdf import PdfConverter
from marker.services.gemini import GoogleGeminiService
from marker.services.ollama import OllamaService
from marker.services.vertex import GoogleVertexService
from marker.services.openai import OpenAIService
from marker.services.azure_openai import AzureOpenAIService
@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [0]})
def test_empty_llm(pdf_converter: PdfConverter, temp_doc):
    # Without use_llm, no LLM service may be wired into the converter.
    assert pdf_converter.llm_service is None
    assert pdf_converter.artifact_dict["llm_service"] is None
def test_llm_no_keys(model_dict, config):
    # Enabling use_llm without providing any API key must fail fast with an
    # AssertionError during converter construction.
    with pytest.raises(AssertionError):
        PdfConverter(artifact_dict=model_dict, config={"use_llm": True})
@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [0], "use_llm": True, "gemini_api_key": "test"})
def test_llm_gemini(pdf_converter: PdfConverter, temp_doc):
    # use_llm plus a Gemini key selects the default Gemini service.
    service = pdf_converter.llm_service
    assert pdf_converter.artifact_dict["llm_service"] is not None
    assert isinstance(service, GoogleGeminiService)
@pytest.mark.output_format("markdown")
@pytest.mark.config(
    {
        "page_range": [0],
        "use_llm": True,
        "vertex_project_id": "test",
        "llm_service": "marker.services.vertex.GoogleVertexService",
    }
)
def test_llm_vertex(pdf_converter: PdfConverter, temp_doc):
    # Explicitly selecting the Vertex service path wires up GoogleVertexService.
    service = pdf_converter.llm_service
    assert pdf_converter.artifact_dict["llm_service"] is not None
    assert isinstance(service, GoogleVertexService)
@pytest.mark.output_format("markdown")
@pytest.mark.config(
    {
        "page_range": [0],
        "use_llm": True,
        "llm_service": "marker.services.ollama.OllamaService",
    }
)
def test_llm_ollama(pdf_converter: PdfConverter, temp_doc):
    # Ollama needs no API key; the service path alone selects it.
    service = pdf_converter.llm_service
    assert pdf_converter.artifact_dict["llm_service"] is not None
    assert isinstance(service, OllamaService)
@pytest.mark.output_format("markdown")
@pytest.mark.config(
    {
        "page_range": [0],
        "use_llm": True,
        "llm_service": "marker.services.openai.OpenAIService",
        "openai_api_key": "test",
    }
)
def test_llm_openai(pdf_converter: PdfConverter, temp_doc):
    # The OpenAI service path plus an API key selects OpenAIService.
    service = pdf_converter.llm_service
    assert pdf_converter.artifact_dict["llm_service"] is not None
    assert isinstance(service, OpenAIService)
@pytest.mark.output_format("markdown")
@pytest.mark.config(
    {
        "page_range": [0],
        "use_llm": True,
        "llm_service": "marker.services.azure_openai.AzureOpenAIService",
        "azure_endpoint": "https://example.openai.azure.com",
        "azure_api_key": "test",
        "deployment_name": "test-model",
        "azure_api_version": "1",
    }
)
def test_llm_azure_openai(pdf_converter: PdfConverter, temp_doc):
    # Azure needs endpoint, key, deployment and API version to construct.
    service = pdf_converter.llm_service
    assert pdf_converter.artifact_dict["llm_service"] is not None
    assert isinstance(service, AzureOpenAIService)
```
--------------------------------------------------------------------------------
/marker/converters/extraction.py:
--------------------------------------------------------------------------------
```python
import re
from typing import Annotated
from marker.builders.document import DocumentBuilder
from marker.builders.line import LineBuilder
from marker.builders.ocr import OcrBuilder
from marker.builders.structure import StructureBuilder
from marker.converters.pdf import PdfConverter
from marker.extractors.document import DocumentExtractor
from marker.extractors.page import PageExtractor
from marker.providers.registry import provider_from_filepath
from marker.renderers.extraction import ExtractionRenderer, ExtractionOutput
from marker.renderers.markdown import MarkdownRenderer
from marker.logger import get_logger
logger = get_logger()
class ExtractionConverter(PdfConverter):
    """
    Converter that renders a PDF to paginated markdown and then runs
    LLM-based structured extraction over the markdown, page by page.
    """

    # Regex matching the page-separator lines the markdown renderer emits
    # when paginate_output is on: "{<page number>}" followed by 48 dashes
    # and a blank line.
    pattern: str = r"{\d+\}-{48}\n\n"
    # NOTE(review): annotated as str but defaults to None — effectively
    # Optional[str]; confirm before tightening the annotation.
    existing_markdown: Annotated[
        str, "Markdown that was already converted for extraction."
    ] = None

    def build_document(self, filepath: str):
        """Build a fully-processed Document for *filepath*.

        Returns a (document, provider) tuple so the caller can keep the
        provider alive while the document is still being rendered.
        """
        provider_cls = provider_from_filepath(filepath)
        layout_builder = self.resolve_dependencies(self.layout_builder_class)
        line_builder = self.resolve_dependencies(LineBuilder)
        ocr_builder = self.resolve_dependencies(OcrBuilder)
        provider = provider_cls(filepath, self.config)
        document = DocumentBuilder(self.config)(
            provider, layout_builder, line_builder, ocr_builder
        )
        structure_builder_cls = self.resolve_dependencies(StructureBuilder)
        structure_builder_cls(document)
        # Run the full processor pipeline before rendering.
        for processor in self.processor_list:
            processor(document)
        return document, provider

    def __call__(self, filepath: str) -> ExtractionOutput:
        """Convert *filepath* (unless existing_markdown is provided) and run
        page-level then document-level extraction over the markdown."""
        self.config["paginate_output"] = True  # Ensure we can split the output properly
        self.config["output_format"] = (
            "markdown"  # Output must be markdown for extraction
        )
        markdown = self.existing_markdown
        if not markdown:
            # No pre-converted markdown supplied; convert the document now.
            document, provider = self.build_document(filepath)
            self.page_count = len(document.pages)
            renderer = self.resolve_dependencies(MarkdownRenderer)
            output = renderer(document)
            markdown = output.markdown

        # Drop the leading empty chunk produced by the first separator.
        output_pages = re.split(self.pattern, markdown)[1:]  # Split output into pages

        # This needs an LLM service for extraction, this sets it in the extractor
        if self.artifact_dict.get("llm_service") is None:
            self.artifact_dict["llm_service"] = self.resolve_dependencies(
                self.default_llm_service
            )

        page_extractor = self.resolve_dependencies(PageExtractor)
        document_extractor = self.resolve_dependencies(DocumentExtractor)
        renderer = self.resolve_dependencies(ExtractionRenderer)

        # Inference in parallel
        notes = page_extractor(output_pages)
        document_output = document_extractor(notes)
        merged = renderer(document_output, markdown)
        return merged
```