#
tokens: 49798/50000 152/235 files (page 1/7)
lines: off (toggle) GitHub
raw markdown copy
This is page 1 of 7. Use http://codebase.md/datalab-to/marker?page={x} to view the full context.

# Directory Structure

```
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── breaking-bug-report.md
│   │   ├── feature_request.md
│   │   └── output-bug-report.md
│   └── workflows
│       ├── benchmarks.yml
│       ├── ci.yml
│       ├── cla.yml
│       ├── publish.yml
│       └── scripts.yml
├── .gitignore
├── .pre-commit-config.yaml
├── benchmarks
│   ├── __init__.py
│   ├── overall
│   │   ├── __init__.py
│   │   ├── display
│   │   │   ├── __init__.py
│   │   │   ├── dataset.py
│   │   │   └── table.py
│   │   ├── download
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── llamaparse.py
│   │   │   ├── main.py
│   │   │   ├── mathpix.py
│   │   │   └── mistral.py
│   │   ├── elo.py
│   │   ├── methods
│   │   │   ├── __init__.py
│   │   │   ├── docling.py
│   │   │   ├── gt.py
│   │   │   ├── llamaparse.py
│   │   │   ├── marker.py
│   │   │   ├── mathpix.py
│   │   │   ├── mistral.py
│   │   │   ├── olmocr.py
│   │   │   └── schema.py
│   │   ├── overall.py
│   │   ├── registry.py
│   │   ├── schema.py
│   │   └── scorers
│   │       ├── __init__.py
│   │       ├── clean.py
│   │       ├── heuristic.py
│   │       ├── llm.py
│   │       └── schema.py
│   ├── table
│   │   ├── __init__.py
│   │   ├── gemini.py
│   │   ├── inference.py
│   │   ├── scoring.py
│   │   └── table.py
│   ├── throughput
│   │   ├── __init__.py
│   │   └── main.py
│   └── verify_scores.py
├── chunk_convert.py
├── CLA.md
├── convert_single.py
├── convert.py
├── data
│   ├── .gitignore
│   ├── examples
│   │   ├── json
│   │   │   ├── multicolcnn.json
│   │   │   ├── switch_trans.json
│   │   │   └── thinkpython.json
│   │   └── markdown
│   │       ├── multicolcnn
│   │       │   ├── _page_1_Figure_0.jpeg
│   │       │   ├── _page_2_Picture_0.jpeg
│   │       │   ├── _page_6_Figure_0.jpeg
│   │       │   ├── _page_7_Figure_0.jpeg
│   │       │   ├── multicolcnn_meta.json
│   │       │   └── multicolcnn.md
│   │       ├── switch_transformers
│   │       │   ├── _page_11_Figure_4.jpeg
│   │       │   ├── _page_12_Figure_4.jpeg
│   │       │   ├── _page_13_Figure_2.jpeg
│   │       │   ├── _page_18_Figure_1.jpeg
│   │       │   ├── _page_18_Figure_3.jpeg
│   │       │   ├── _page_2_Figure_3.jpeg
│   │       │   ├── _page_20_Figure_1.jpeg
│   │       │   ├── _page_20_Figure_4.jpeg
│   │       │   ├── _page_27_Figure_1.jpeg
│   │       │   ├── _page_29_Figure_1.jpeg
│   │       │   ├── _page_30_Figure_1.jpeg
│   │       │   ├── _page_31_Figure_3.jpeg
│   │       │   ├── _page_4_Figure_1.jpeg
│   │       │   ├── _page_5_Figure_3.jpeg
│   │       │   ├── switch_trans_meta.json
│   │       │   └── switch_trans.md
│   │       └── thinkpython
│   │           ├── _page_109_Figure_1.jpeg
│   │           ├── _page_115_Figure_1.jpeg
│   │           ├── _page_116_Figure_3.jpeg
│   │           ├── _page_127_Figure_1.jpeg
│   │           ├── _page_128_Figure_1.jpeg
│   │           ├── _page_167_Figure_1.jpeg
│   │           ├── _page_169_Figure_1.jpeg
│   │           ├── _page_173_Figure_1.jpeg
│   │           ├── _page_190_Figure_1.jpeg
│   │           ├── _page_195_Figure_1.jpeg
│   │           ├── _page_205_Figure_1.jpeg
│   │           ├── _page_23_Figure_1.jpeg
│   │           ├── _page_23_Figure_3.jpeg
│   │           ├── _page_230_Figure_1.jpeg
│   │           ├── _page_233_Figure_1.jpeg
│   │           ├── _page_233_Figure_3.jpeg
│   │           ├── _page_234_Figure_1.jpeg
│   │           ├── _page_235_Figure_1.jpeg
│   │           ├── _page_236_Figure_1.jpeg
│   │           ├── _page_236_Figure_3.jpeg
│   │           ├── _page_237_Figure_1.jpeg
│   │           ├── _page_238_Figure_1.jpeg
│   │           ├── _page_46_Figure_1.jpeg
│   │           ├── _page_60_Figure_1.jpeg
│   │           ├── _page_60_Figure_3.jpeg
│   │           ├── _page_67_Figure_1.jpeg
│   │           ├── _page_71_Figure_1.jpeg
│   │           ├── _page_78_Figure_1.jpeg
│   │           ├── _page_85_Figure_1.jpeg
│   │           ├── _page_94_Figure_1.jpeg
│   │           ├── _page_99_Figure_17.jpeg
│   │           ├── _page_99_Figure_178.jpeg
│   │           ├── thinkpython_meta.json
│   │           └── thinkpython.md
│   ├── images
│   │   ├── overall.png
│   │   ├── per_doc.png
│   │   └── table.png
│   └── latex_to_md.sh
├── examples
│   ├── marker_modal_deployment.py
│   └── README.md
├── extraction_app.py
├── LICENSE
├── marker
│   ├── builders
│   │   ├── __init__.py
│   │   ├── document.py
│   │   ├── layout.py
│   │   ├── line.py
│   │   ├── ocr.py
│   │   └── structure.py
│   ├── config
│   │   ├── __init__.py
│   │   ├── crawler.py
│   │   ├── parser.py
│   │   └── printer.py
│   ├── converters
│   │   ├── __init__.py
│   │   ├── extraction.py
│   │   ├── ocr.py
│   │   ├── pdf.py
│   │   └── table.py
│   ├── extractors
│   │   ├── __init__.py
│   │   ├── document.py
│   │   └── page.py
│   ├── logger.py
│   ├── models.py
│   ├── output.py
│   ├── processors
│   │   ├── __init__.py
│   │   ├── blank_page.py
│   │   ├── block_relabel.py
│   │   ├── blockquote.py
│   │   ├── code.py
│   │   ├── debug.py
│   │   ├── document_toc.py
│   │   ├── equation.py
│   │   ├── footnote.py
│   │   ├── ignoretext.py
│   │   ├── line_merge.py
│   │   ├── line_numbers.py
│   │   ├── list.py
│   │   ├── llm
│   │   │   ├── __init__.py
│   │   │   ├── llm_complex.py
│   │   │   ├── llm_equation.py
│   │   │   ├── llm_form.py
│   │   │   ├── llm_handwriting.py
│   │   │   ├── llm_image_description.py
│   │   │   ├── llm_mathblock.py
│   │   │   ├── llm_meta.py
│   │   │   ├── llm_page_correction.py
│   │   │   ├── llm_sectionheader.py
│   │   │   ├── llm_table_merge.py
│   │   │   └── llm_table.py
│   │   ├── order.py
│   │   ├── page_header.py
│   │   ├── reference.py
│   │   ├── sectionheader.py
│   │   ├── table.py
│   │   ├── text.py
│   │   └── util.py
│   ├── providers
│   │   ├── __init__.py
│   │   ├── document.py
│   │   ├── epub.py
│   │   ├── html.py
│   │   ├── image.py
│   │   ├── pdf.py
│   │   ├── powerpoint.py
│   │   ├── registry.py
│   │   ├── spreadsheet.py
│   │   └── utils.py
│   ├── renderers
│   │   ├── __init__.py
│   │   ├── chunk.py
│   │   ├── extraction.py
│   │   ├── html.py
│   │   ├── json.py
│   │   ├── markdown.py
│   │   └── ocr_json.py
│   ├── schema
│   │   ├── __init__.py
│   │   ├── blocks
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── basetable.py
│   │   │   ├── caption.py
│   │   │   ├── code.py
│   │   │   ├── complexregion.py
│   │   │   ├── equation.py
│   │   │   ├── figure.py
│   │   │   ├── footnote.py
│   │   │   ├── form.py
│   │   │   ├── handwriting.py
│   │   │   ├── inlinemath.py
│   │   │   ├── listitem.py
│   │   │   ├── pagefooter.py
│   │   │   ├── pageheader.py
│   │   │   ├── picture.py
│   │   │   ├── reference.py
│   │   │   ├── sectionheader.py
│   │   │   ├── table.py
│   │   │   ├── tablecell.py
│   │   │   ├── text.py
│   │   │   └── toc.py
│   │   ├── document.py
│   │   ├── groups
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── figure.py
│   │   │   ├── list.py
│   │   │   ├── page.py
│   │   │   ├── picture.py
│   │   │   └── table.py
│   │   ├── polygon.py
│   │   ├── registry.py
│   │   └── text
│   │       ├── __init__.py
│   │       ├── char.py
│   │       ├── line.py
│   │       └── span.py
│   ├── scripts
│   │   ├── __init__.py
│   │   ├── chunk_convert.py
│   │   ├── chunk_convert.sh
│   │   ├── common.py
│   │   ├── convert_single.py
│   │   ├── convert.py
│   │   ├── extraction_app.py
│   │   ├── file_to_s3.py
│   │   ├── run_streamlit_app.py
│   │   ├── server.py
│   │   └── streamlit_app.py
│   ├── services
│   │   ├── __init__.py
│   │   ├── azure_openai.py
│   │   ├── claude.py
│   │   ├── gemini.py
│   │   ├── ollama.py
│   │   ├── openai.py
│   │   └── vertex.py
│   ├── settings.py
│   ├── util.py
│   └── utils
│       ├── __init__.py
│       ├── batch.py
│       ├── gpu.py
│       └── image.py
├── marker_app.py
├── marker_server.py
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── README.md
├── signatures
│   └── version1
│       └── cla.json
├── static
│   └── fonts
│       └── .gitignore
└── tests
    ├── builders
    │   ├── test_blank_page.py
    │   ├── test_document_builder.py
    │   ├── test_garbled_pdf.py
    │   ├── test_layout_replace.py
    │   ├── test_ocr_builder.py
    │   ├── test_ocr_pipeline.py
    │   ├── test_overriding.py
    │   ├── test_pdf_links.py
    │   ├── test_rotated_bboxes.py
    │   ├── test_strip_existing_ocr.py
    │   └── test_structure.py
    ├── config
    │   └── test_config.py
    ├── conftest.py
    ├── converters
    │   ├── test_extraction_converter.py
    │   ├── test_ocr_converter.py
    │   ├── test_pdf_converter.py
    │   └── test_table_converter.py
    ├── processors
    │   ├── test_document_toc_processor.py
    │   ├── test_equation_processor.py
    │   ├── test_footnote_processor.py
    │   ├── test_ignoretext.py
    │   ├── test_llm_processors.py
    │   ├── test_table_merge.py
    │   └── test_table_processor.py
    ├── providers
    │   ├── test_document_providers.py
    │   ├── test_image_provider.py
    │   └── test_pdf_provider.py
    ├── renderers
    │   ├── test_chunk_renderer.py
    │   ├── test_extract_images.py
    │   ├── test_html_renderer.py
    │   ├── test_json_renderer.py
    │   └── test_markdown_renderer.py
    ├── schema
    │   └── groups
    │       └── test_list_grouping.py
    ├── services
    │   └── test_service_init.py
    └── utils.py
```

# Files

--------------------------------------------------------------------------------
/static/fonts/.gitignore:
--------------------------------------------------------------------------------

```
*
!.gitignore
```

--------------------------------------------------------------------------------
/data/.gitignore:
--------------------------------------------------------------------------------

```
latex
pdfs
references
```

--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------

```yaml
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
  # Ruff version.
  rev: v0.9.10
  hooks:
    # Run the linter.
    - id: ruff
      types_or: [ python, pyi ]
      args: [ --fix ]
    # Run the formatter.
    - id: ruff-format
      types_or: [ python, pyi ]
```

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

```
private.py
.DS_Store
local.env
experiments
test_data
training
wandb
*.dat
report.json
benchmark_data
debug_data
temp.md
temp
conversion_results
uploads
/cache

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.vscode/
```

--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------

```markdown
## Usage Examples

This directory contains examples of running `marker` in different contexts.

### Usage with Modal

We have a [self-contained example](./marker_modal_deployment.py) that shows how you can quickly use [Modal](https://modal.com) to deploy `marker` by provisioning a container with a GPU, and expose that with an API so you can submit PDFs for conversion into Markdown, HTML, or JSON.

It's a limited example that you can extend into different use cases.

#### Pre-requisites

Make sure you have the `modal` client installed by [following their instructions here](https://modal.com/docs/guide#getting-started).

Modal's [Starter Plan](https://modal.com/pricing) includes $30 of free compute each month.
Modal is [serverless](https://arxiv.org/abs/1902.03383), so you only pay for resources when you are using them.

#### Running the example

Once `modal` is configured, you can deploy it to your workspace by running:

> modal deploy marker_modal_deployment.py

Notes:
- `marker` has a few models it uses. By default, the endpoint will check if these models are loaded and download them if not (first request will be slow). You can avoid this by running

> modal run marker_modal_deployment.py::download_models

Which will create a [`Modal Volume`](https://modal.com/docs/guide/Volumes) to store them for re-use.

Once the deploy is finished, you can:
- Test a file upload locally through your CLI using an `invoke_conversion` command we expose through Modal's [`local_entrypoint`](https://modal.com/docs/reference/modal.App#local_entrypoint)
- Get the URL of your endpoint and make a request through a client of your choice.

**Test from your CLI with `invoke_conversion`**

If your endpoint is live, simply run this command:

```
$ modal run marker_modal_deployment.py::invoke_conversion --pdf-file <PDF_FILE_PATH> --output-format markdown
```

And it'll automatically detect the URL of your new endpoint using [`.get_web_url()`](https://modal.com/docs/guide/webhook-urls#determine-the-url-of-a-web-endpoint-from-code), make sure it's healthy, submit your file, and store its output on your machine (in the same directory).

**Making a request using your own client**

If you want to make requests elsewhere e.g. with cURL or a client like Insomnia, you'll need to get the URL.

When your `modal deploy` command from earlier finishes, it'll include your endpoint URL at the end. For example:

```
$ modal deploy marker_modal_deployment.py
...
✓ Created objects.
├── 🔨 Created mount /marker/examples/marker_modal_deployment.py
├── 🔨 Created function download_models.
├── 🔨 Created function MarkerModalDemoService.*.
└── 🔨 Created web endpoint for MarkerModalDemoService.fastapi_app => <YOUR_ENDPOINT_URL>
✓ App deployed in 149.877s! 🎉
```

If you accidentally close your terminal session, you can also always go into Modal's dashboard and:
  - Find the app (default name: `datalab-marker-modal-demo`)
  - Click on `MarkerModalDemoService`
  - Find your endpoint URL

Once you have your URL, make a request to `{YOUR_ENDPOINT_URL}/convert` like this (you can also use Insomnia, etc.):
```
curl --request POST \
  --url {BASE_URL}/convert \
  --header 'Content-Type: multipart/form-data' \
  --form file=@/Users/cooldev/sample.pdf \
  --form output_format=html
```

You should get a response like this

```
{
	"success": true,
	"filename": "sample.pdf",
	"output_format": "html",
	"json": null,
	"html": "<YOUR_RESPONSE_CONTENT>",
	"markdown": null,
	"images": {},
	"metadata": {... page level metadata ...},
	"page_count": 2
}
```

[Modal](https://modal.com) makes deploying and scaling models and inference workloads much easier.

If you're interested in Datalab's managed API or on-prem document intelligence solution, check out [our platform here](https://datalab.to/?utm_source=gh-marker).

```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

```markdown
# Marker

Marker converts documents to markdown, JSON, chunks, and HTML quickly and accurately.

- Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages
- Formats tables, forms, equations, inline math, links, references, and code blocks
- Extracts and saves images
- Removes headers/footers/other artifacts
- Extensible with your own formatting and logic
- Does structured extraction, given a JSON schema (beta)
- Optionally boost accuracy with LLMs (and your own prompt)
- Works on GPU, CPU, or MPS

For our managed API or on-prem document intelligence solution, check out [our platform here](https://datalab.to?utm_source=gh-marker).

## Performance

<img src="data/images/overall.png" width="800px"/>

Marker benchmarks favorably compared to cloud services like Llamaparse and Mathpix, as well as other open source tools.

The above results are running single PDF pages serially.  Marker is significantly faster when running in batch mode, with a projected throughput of 25 pages/second on an H100.

See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks.

## Hybrid Mode

For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker.  This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms.  It can use any gemini or ollama model.  By default, it uses `gemini-2.0-flash`.  See [below](#llm-services) for details.

Here is a table benchmark comparing marker, gemini flash alone, and marker with use_llm:

<img src="data/images/table.png" width="400px"/>

As you can see, the use_llm mode offers higher accuracy than marker or gemini alone.

## Examples

| PDF | File type | Markdown                                                                                                                     | JSON                                                                                                   |
|-----|-----------|------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------|
| [Think Python](https://greenteapress.com/thinkpython/thinkpython.pdf) | Textbook | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/thinkpython/thinkpython.md)                 | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/thinkpython.json)         |
| [Switch Transformers](https://arxiv.org/pdf/2101.03961.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/switch_transformers/switch_trans.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/switch_trans.json) |
| [Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/multicolcnn/multicolcnn.md)                 | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/multicolcnn.json)         |

# Commercial usage

Our model weights use a modified AI Pubs Open Rail-M license (free for research, personal use, and startups under $2M funding/revenue) and our code is GPL. For broader commercial licensing or to remove GPL requirements, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-marker).

# Hosted API & On-prem

There's a [hosted API](https://www.datalab.to?utm_source=gh-marker) and [painless on-prem solution](https://www.datalab.to/blog/self-serve-on-prem-licensing) for marker - it's free to sign up, and we'll throw in credits for you to test it out.

The API:
- Supports PDF, image, PPT, PPTX, DOC, DOCX, XLS, XLSX, HTML, EPUB files
- Is 1/4th the price of leading cloud-based competitors
- Fast - ~15s for a 250 page PDF
- Supports LLM mode
- High uptime (99.99%)

# Community

[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development.

# Installation

You'll need python 3.10+ and [PyTorch](https://pytorch.org/get-started/locally/).

Install with:

```shell
pip install marker-pdf
```

If you want to use marker on documents other than PDFs, you will need to install additional dependencies with:

```shell
pip install marker-pdf[full]
```

# Usage

First, some configuration:

- Your torch device will be automatically detected, but you can override this.  For example, `TORCH_DEVICE=cuda`.
- Some PDFs, even digital ones, have bad text in them.  Set `--force_ocr` to force OCR on all lines, or set `--strip_existing_ocr` to keep all digital text while stripping out any existing OCR text.
- If you care about inline math, set `force_ocr` to convert inline math to LaTeX.

## Interactive App

I've included a streamlit app that lets you interactively try marker with some basic options.  Run it with:

```shell
pip install streamlit streamlit-ace
marker_gui
```

## Convert a single file

```shell
marker_single /path/to/file.pdf
```

You can pass in PDFs or images.

Options:
- `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
- `--output_format [markdown|json|html|chunks]`: Specify the format for the output results.
- `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
- `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n`
- `--use_llm`: Uses an LLM to improve accuracy.  You will need to configure the LLM backend - see [below](#llm-services).
- `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text.  This will also format inline math properly.
- `--block_correction_prompt`: if LLM mode is active, an optional prompt that will be used to correct the output of marker.  This is useful for custom formatting or logic that you want to apply to the output.
- `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya.
- `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`.
- `--disable_image_extraction`: Don't extract images from the PDF.  If you also specify `--use_llm`, then images will be replaced with a description.
- `--debug`: Enable debug mode for additional logging and diagnostic information.
- `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"`
- `--config_json PATH`: Path to a JSON configuration file containing additional settings.
- `config --help`: List all available builders, processors, and converters, and their associated configuration.  These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
- `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`.  The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables.
- `--llm_service`: Which llm service to use if `--use_llm` is passed.  This defaults to `marker.services.gemini.GoogleGeminiService`.
- `--help`: see all of the flags that can be passed into marker.  (It supports many more options than are listed above.)

The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py).  If you don't need OCR, marker can work with any language.

## Convert multiple files

```shell
marker /path/to/input/folder
```

- `marker` supports all the same options from `marker_single` above.
- `--workers` is the number of conversion workers to run simultaneously.  This is automatically set by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage.  Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average.

## Convert multiple files on multiple GPUs

```shell
NUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert ../pdf_in ../md_out
```

- `NUM_DEVICES` is the number of GPUs to use.  Should be `2` or greater.
- `NUM_WORKERS` is the number of parallel processes to run on each GPU.

## Use from python

See the `PdfConverter` class in `marker/converters/pdf.py` for additional arguments that can be passed.

```python
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

converter = PdfConverter(
    artifact_dict=create_model_dict(),
)
rendered = converter("FILEPATH")
text, _, images = text_from_rendered(rendered)
```

`rendered` will be a pydantic basemodel with different properties depending on the output type requested.  With markdown output (default), you'll have the properties `markdown`, `metadata`, and `images`.  For json output, you'll have `children`, `block_type`, and `metadata`.

### Custom configuration

You can pass configuration using the `ConfigParser`.  To see all available options, do `marker_single --help`.

```python
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser

config = {
    "output_format": "json",
    "ADDITIONAL_KEY": "VALUE"
}
config_parser = ConfigParser(config)

converter = PdfConverter(
    config=config_parser.generate_config_dict(),
    artifact_dict=create_model_dict(),
    processor_list=config_parser.get_processors(),
    renderer=config_parser.get_renderer(),
    llm_service=config_parser.get_llm_service()
)
rendered = converter("FILEPATH")
```

### Extract blocks

Each document consists of one or more pages.  Pages contain blocks, which can themselves contain other blocks.  It's possible to programmatically manipulate these blocks.

Here's an example of extracting all forms from a document:

```python
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.schema import BlockTypes

converter = PdfConverter(
    artifact_dict=create_model_dict(),
)
document = converter.build_document("FILEPATH")
forms = document.contained_blocks((BlockTypes.Form,))
```

Look at the processors for more examples of extracting and manipulating blocks.

## Other converters

You can also use other converters that define different conversion pipelines:

### Extract tables

The `TableConverter` will only convert and extract tables:

```python
from marker.converters.table import TableConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

converter = TableConverter(
    artifact_dict=create_model_dict(),
)
rendered = converter("FILEPATH")
text, _, images = text_from_rendered(rendered)
```

This takes all the same configuration as the PdfConverter.  You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table.  Set `output_format=json` to also get cell bounding boxes.

You can also run this via the CLI with
```shell
marker_single FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter --output_format json
```

### OCR Only

If you only want to run OCR, you can also do that through the `OCRConverter`.  Set `--keep_chars` to keep individual characters and bounding boxes.

```python
from marker.converters.ocr import OCRConverter
from marker.models import create_model_dict

converter = OCRConverter(
    artifact_dict=create_model_dict(),
)
rendered = converter("FILEPATH")
```

This takes all the same configuration as the PdfConverter.

You can also run this via the CLI with
```shell
marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter
```

### Structured Extraction (beta)

You can run structured extraction via the `ExtractionConverter`.  This requires an llm service to be setup first (see [here](#llm-services) for details).  You'll get a JSON output with the extracted values.

```python
from marker.converters.extraction import ExtractionConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser
from pydantic import BaseModel

class Links(BaseModel):
    links: list[str]

schema = Links.model_json_schema()
config_parser = ConfigParser({
    "page_schema": schema
})

converter = ExtractionConverter(
    artifact_dict=create_model_dict(),
    config=config_parser.generate_config_dict(),
    llm_service=config_parser.get_llm_service(),
)
rendered = converter("FILEPATH")
```

Rendered will have an `original_markdown` field.  If you pass this back in next time you run the converter, as the `existing_markdown` config key, you can skip re-parsing the document.

# Output Formats

## Markdown

Markdown output will include:

- image links (images will be saved in the same folder)
- formatted tables
- embedded LaTeX equations (fenced with `$$`)
- Code is fenced with triple backticks
- Superscripts for footnotes

## HTML

HTML output is similar to markdown output:

- Images are included via `img` tags
- equations are fenced with `<math>` tags
- code is in `pre` tags

## JSON

JSON output will be organized in a tree-like structure, with the leaf nodes being blocks.  Examples of leaf nodes are a single list item, a paragraph of text, or an image.

The output will be a list, with each list item representing a page.  Each page is considered a block in the internal marker schema.  There are different types of blocks to represent different elements.

Pages have the keys:

- `id` - unique id for the block.
- `block_type` - the type of block. The possible block types can be seen in `marker/schema/__init__.py`.  As of this writing, they are ["Line", "Span", "FigureGroup", "TableGroup", "ListGroup", "PictureGroup", "Page", "Caption", "Code", "Figure", "Footnote", "Form", "Equation", "Handwriting", "TextInlineMath", "ListItem", "PageFooter", "PageHeader", "Picture", "SectionHeader", "Table", "Text", "TableOfContents", "Document"]
- `html` - the HTML for the page.  Note that this will have recursive references to children.  The `content-ref` tags must be replaced with the child content if you want the full html.  You can see an example of this at `marker/output.py:json_to_html`.  That function will take in a single block from the json output, and turn it into HTML.
- `polygon` - the 4-corner polygon of the page, in (x1,y1), (x2,y2), (x3, y3), (x4, y4) format.  (x1,y1) is the top left, and coordinates go clockwise.
- `children` - the child blocks.

The child blocks have two additional keys:

- `section_hierarchy` - indicates the sections that the block is part of.  `1` indicates an h1 tag, `2` an h2, and so on.
- `images` - base64 encoded images.  The key will be the block id, and the data will be the encoded image.

Note that child blocks of pages can have their own children as well (a tree structure).

```json
{
      "id": "/page/10/Page/366",
      "block_type": "Page",
      "html": "<content-ref src='/page/10/SectionHeader/0'></content-ref><content-ref src='/page/10/SectionHeader/1'></content-ref><content-ref src='/page/10/Text/2'></content-ref><content-ref src='/page/10/Text/3'></content-ref><content-ref src='/page/10/Figure/4'></content-ref><content-ref src='/page/10/SectionHeader/5'></content-ref><content-ref src='/page/10/SectionHeader/6'></content-ref><content-ref src='/page/10/TextInlineMath/7'></content-ref><content-ref src='/page/10/TextInlineMath/8'></content-ref><content-ref src='/page/10/Table/9'></content-ref><content-ref src='/page/10/SectionHeader/10'></content-ref><content-ref src='/page/10/Text/11'></content-ref>",
      "polygon": [[0.0, 0.0], [612.0, 0.0], [612.0, 792.0], [0.0, 792.0]],
      "children": [
        {
          "id": "/page/10/SectionHeader/0",
          "block_type": "SectionHeader",
          "html": "<h1>Supplementary Material for <i>Subspace Adversarial Training</i> </h1>",
          "polygon": [
            [217.845703125, 80.630859375], [374.73046875, 80.630859375],
            [374.73046875, 107.0],
            [217.845703125, 107.0]
          ],
          "children": null,
          "section_hierarchy": {
            "1": "/page/10/SectionHeader/1"
          },
          "images": {}
        },
        ...
        ]
    }


```

## Chunks

Chunks format is similar to JSON, but flattens everything into a single list instead of a tree.  Only the top level blocks from each page show up. It also has the full HTML of each block inside, so you don't need to crawl the tree to reconstruct it.  This enables flexible and easy chunking for RAG.

## Metadata

All output formats will return a metadata dictionary, with the following fields:

```json
{
    "table_of_contents": [
      {
        "title": "Introduction",
        "heading_level": 1,
        "page_id": 0,
        "polygon": [...]
      }
    ], // computed PDF table of contents
    "page_stats": [
      {
        "page_id":  0,
        "text_extraction_method": "pdftext",
        "block_counts": [("Span", 200), ...]
      },
      ...
    ]
}
```

# LLM Services

When running with the `--use_llm` flag, you have a choice of services you can use:

- `Gemini` - this will use the Gemini developer API by default.  You'll need to pass `--gemini_api_key` to configuration.
- `Google Vertex` - this will use vertex, which can be more reliable.  You'll need to pass `--vertex_project_id`.  To use it, set `--llm_service=marker.services.vertex.GoogleVertexService`.
- `Ollama` - this will use local models.  You can configure `--ollama_base_url` and `--ollama_model`. To use it, set `--llm_service=marker.services.ollama.OllamaService`.
- `Claude` - this will use the anthropic API.  You can configure `--claude_api_key`, and `--claude_model_name`.  To use it, set `--llm_service=marker.services.claude.ClaudeService`.
- `OpenAI` - this supports any openai-like endpoint. You can configure `--openai_api_key`, `--openai_model`, and `--openai_base_url`. To use it, set `--llm_service=marker.services.openai.OpenAIService`.
- `Azure OpenAI` - this uses the Azure OpenAI service. You can configure `--azure_endpoint`, `--azure_api_key`, and `--deployment_name`. To use it, set `--llm_service=marker.services.azure_openai.AzureOpenAIService`.

These services may have additional optional configuration as well - you can see it by viewing the classes.

# Internals

Marker is easy to extend.  The core units of marker are:

- `Providers`, at `marker/providers`.  These provide information from a source file, like a PDF.
- `Builders`, at `marker/builders`.  These generate the initial document blocks and fill in text, using info from the providers.
- `Processors`, at `marker/processors`.  These process specific blocks, for example the table formatter is a processor.
- `Renderers`, at `marker/renderers`. These use the blocks to render output.
- `Schema`, at `marker/schema`.  The classes for all the block types.
- `Converters`, at `marker/converters`.  They run the whole end to end pipeline.

To customize processing behavior, override the `processors`.  To add new output formats, write a new `renderer`.  For additional input formats, write a new `provider`.

Processors and renderers can be directly passed into the base `PDFConverter`, so you can specify your own custom processing easily.

## API server

There is a very simple API server you can run like this:

```shell
pip install -U uvicorn fastapi python-multipart
marker_server --port 8001
```

This will start a fastapi server that you can access at `localhost:8001`.  You can go to `localhost:8001/docs` to see the endpoint options.

You can send requests like this:

```
import requests
import json

post_data = {
    'filepath': 'FILEPATH',
    # Add other params here
}

requests.post("http://localhost:8001/marker", data=json.dumps(post_data)).json()
```

Note that this is not a very robust API, and is only intended for small-scale use.  If you want to use this server, but want a more robust conversion option, you can use the hosted [Datalab API](https://www.datalab.to/plans).

# Troubleshooting

There are some settings that you may find useful if things aren't working the way you expect:

- If you have issues with accuracy, try setting `--use_llm` to use an LLM to improve quality.  You must set `GOOGLE_API_KEY` to a Gemini API key for this to work.
- Make sure to set `force_ocr` if you see garbled text - this will re-OCR the document.
- `TORCH_DEVICE` - set this to force marker to use a given torch device for inference.
- If you're getting out of memory errors, decrease worker count.  You can also try splitting up long PDFs into multiple files.

## Debugging

Pass the `debug` option to activate debug mode.  This will save images of each page with detected layout and text, as well as output a json file with additional bounding box information.

# Benchmarks

## Overall PDF Conversion

We created a [benchmark set](https://huggingface.co/datasets/datalab-to/marker_benchmark) by extracting single PDF pages from common crawl.  We scored based on a heuristic that aligns text with ground truth text segments, and an LLM as a judge scoring method.

| Method     | Avg Time | Heuristic Score | LLM Score |
|------------|----------|-----------------|-----------|
| marker     | 2.83837  | 95.6709         | 4.23916   |
| llamaparse | 23.348   | 84.2442         | 3.97619   |
| mathpix    | 6.36223  | 86.4281         | 4.15626   |
| docling    | 3.69949  | 86.7073         | 3.70429   |

Benchmarks were run on an H100 for marker and docling - llamaparse and mathpix used their cloud services.  We can also look at it by document type:

<img src="data/images/per_doc.png" width="1000px"/>

| Document Type        | Marker heuristic | Marker LLM | Llamaparse Heuristic | Llamaparse LLM | Mathpix Heuristic | Mathpix LLM | Docling Heuristic | Docling LLM |
|----------------------|------------------|------------|----------------------|----------------|-------------------|-------------|-------------------|-------------|
| Scientific paper     | 96.6737          | 4.34899    | 87.1651              | 3.96421        | 91.2267           | 4.46861     | 92.135            | 3.72422     |
| Book page            | 97.1846          | 4.16168    | 90.9532              | 4.07186        | 93.8886           | 4.35329     | 90.0556           | 3.64671     |
| Other                | 95.1632          | 4.25076    | 81.1385              | 4.01835        | 79.6231           | 4.00306     | 83.8223           | 3.76147     |
| Form                 | 88.0147          | 3.84663    | 66.3081              | 3.68712        | 64.7512           | 3.33129     | 68.3857           | 3.40491     |
| Presentation         | 95.1562          | 4.13669    | 81.2261              | 4              | 83.6737           | 3.95683     | 84.8405           | 3.86331     |
| Financial document   | 95.3697          | 4.39106    | 82.5812              | 4.16111        | 81.3115           | 4.05556     | 86.3882           | 3.8         |
| Letter               | 98.4021          | 4.5        | 93.4477              | 4.28125        | 96.0383           | 4.45312     | 92.0952           | 4.09375     |
| Engineering document | 93.9244          | 4.04412    | 77.4854              | 3.72059        | 80.3319           | 3.88235     | 79.6807           | 3.42647     |
| Legal document       | 96.689           | 4.27759    | 86.9769              | 3.87584        | 91.601            | 4.20805     | 87.8383           | 3.65552     |
| Newspaper page       | 98.8733          | 4.25806    | 84.7492              | 3.90323        | 96.9963           | 4.45161     | 92.6496           | 3.51613     |
| Magazine page        | 98.2145          | 4.38776    | 87.2902              | 3.97959        | 93.5934           | 4.16327     | 93.0892           | 4.02041     |

## Throughput

We benchmarked throughput using a [single long PDF](https://www.greenteapress.com/thinkpython/thinkpython.pdf).

| Method  | Time per page | Time per document | VRAM used |
|---------|---------------|-------------------|---------- |
| marker  | 0.18          | 43.42             |  3.17GB   |

The projected throughput is 122 pages per second on an H100 - we can run 22 individual processes given the VRAM used.

## Table Conversion

Marker can extract tables from PDFs using `marker.converters.table.TableConverter`. The table extraction performance is measured by comparing the extracted HTML representation of tables against the original HTML representations using the test split of [FinTabNet](https://developer.ibm.com/exchanges/data/all/fintabnet/). The HTML representations are compared using a tree edit distance based metric to judge both structure and content. Marker detects and identifies the structure of all tables in a PDF page and achieves these scores:

| Method           | Avg score | Total tables |
|------------------|-----------|--------------|
| marker           | 0.816     | 99           |
| marker w/use_llm | 0.907     | 99           |
| gemini           | 0.829     | 99           |

The `--use_llm` flag can significantly improve table recognition performance, as you can see.

We filter out tables that we cannot align with the ground truth, since fintabnet and our layout model have slightly different detection methods (this results in some tables being split/merged).

## Running your own benchmarks

You can benchmark the performance of marker on your machine. Install marker manually with:

```shell
git clone https://github.com/VikParuchuri/marker.git
poetry install
```

### Overall PDF Conversion

Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. Then run the overall benchmark like this:

```shell
python benchmarks/overall.py --methods marker --scores heuristic,llm
```

Options:

- `--use_llm` use an llm to improve the marker results.
- `--max_rows` how many rows to process for the benchmark.
- `--methods` can be `llamaparse`, `mathpix`, `docling`, `marker`.  Comma separated.
- `--scores` which scoring functions to use, can be `llm`, `heuristic`.  Comma separated.

### Table Conversion
The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:

```shell
python benchmarks/table/table.py --max_rows 100
```

Options:

- `--use_llm` uses an llm with marker to improve accuracy.
- `--use_gemini` also benchmarks gemini 2.0 flash.

# How it works

Marker is a pipeline of deep learning models:

- Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya))
- Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya))
- Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify), [surya](https://github.com/VikParuchuri/surya))
- Optionally use an LLM to improve quality
- Combine blocks and postprocess complete text

It only uses models where necessary, which improves speed and accuracy.

# Limitations

PDF is a tricky format, so marker will not always work perfectly.  Here are some known limitations that are on the roadmap to address:

- Very complex layouts, with nested tables and forms, may not work
- Forms may not be rendered well

Note: Passing the `--use_llm` and `--force_ocr` flags will mostly solve these issues.

# Usage and Deployment Examples

You can always run `marker` locally, but if you wanted to expose it as an API, we have a few options:
- Our platform API which is powered by `marker` and `surya` and is easy to test out - it's free to sign up, and we'll include credits, [try it out here](https://datalab.to)
- Our painless on-prem solution for commercial use, which you can [read about here](https://www.datalab.to/blog/self-serve-on-prem-licensing) and gives you privacy guarantees with high throughput inference optimizations.
- [Deployment example with Modal](./examples/README_MODAL.md) that shows you how to deploy and access `marker` through a web endpoint using [`Modal`](https://modal.com). Modal is an AI compute platform that enables developers to deploy and scale models on GPUs in minutes.

```

--------------------------------------------------------------------------------
/benchmarks/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/benchmarks/overall/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/benchmarks/overall/display/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/benchmarks/overall/download/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/benchmarks/table/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/benchmarks/throughput/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/marker/config/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/marker/scripts/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/marker/utils/__init__.py:
--------------------------------------------------------------------------------

```python

```

--------------------------------------------------------------------------------
/marker/schema/groups/base.py:
--------------------------------------------------------------------------------

```python
from marker.schema.blocks import Block


class Group(Block):
    """Common base type for grouping blocks (figure/table/list/picture groups).

    Adds no behavior beyond ``Block``; it exists so group blocks share a
    single ancestor type.
    """

    pass
```

--------------------------------------------------------------------------------
/marker/schema/text/__init__.py:
--------------------------------------------------------------------------------

```python
from marker.schema.text.line import Line
from marker.schema.text.span import Span

```

--------------------------------------------------------------------------------
/marker_server.py:
--------------------------------------------------------------------------------

```python
from marker.scripts.server import server_cli

# Entry point: delegate to the server CLI defined in marker.scripts.server.
if __name__ == "__main__":
    server_cli()

```

--------------------------------------------------------------------------------
/convert.py:
--------------------------------------------------------------------------------

```python
from marker.scripts.convert import convert_cli

# Entry point: delegate to the batch-conversion CLI in marker.scripts.convert.
if __name__ == "__main__":
    convert_cli()

```

--------------------------------------------------------------------------------
/chunk_convert.py:
--------------------------------------------------------------------------------

```python
from marker.scripts.chunk_convert import chunk_convert_cli

# Entry point: delegate to the chunked-conversion CLI in marker.scripts.chunk_convert.
if __name__ == "__main__":
    chunk_convert_cli()
```

--------------------------------------------------------------------------------
/convert_single.py:
--------------------------------------------------------------------------------

```python
from marker.scripts.convert_single import convert_single_cli

# Entry point: delegate to the single-file conversion CLI in marker.scripts.convert_single.
if __name__ == "__main__":
    convert_single_cli()

```

--------------------------------------------------------------------------------
/marker_app.py:
--------------------------------------------------------------------------------

```python
from marker.scripts.run_streamlit_app import streamlit_app_cli

# Entry point: launch the Streamlit demo app.
if __name__ == "__main__":
    streamlit_app_cli()
```

--------------------------------------------------------------------------------
/extraction_app.py:
--------------------------------------------------------------------------------

```python
from marker.scripts.run_streamlit_app import extraction_app_cli

# Entry point: launch the Streamlit structured-extraction app.
if __name__ == "__main__":
    extraction_app_cli()

```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/schema.py:
--------------------------------------------------------------------------------

```python
from typing import TypedDict, List


class BenchmarkResult(TypedDict):
    """Result of running one conversion method on one benchmark sample."""

    # Converted output: a single markdown string, or one string per page.
    markdown: str | List[str]
    # Conversion time (presumably seconds -- confirm with callers); None when not measured.
    time: float | None
```

--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------

```
[pytest]
testpaths=tests
markers =
    filename(name): specify the filename for the pdf_document fixture
filterwarnings =
    ignore::Warning
```

--------------------------------------------------------------------------------
/benchmarks/overall/scorers/schema.py:
--------------------------------------------------------------------------------

```python
from typing import TypedDict, List, Optional, Dict


class BlockScores(TypedDict):
    """Scores produced by a scorer for a single sample."""

    # Overall score for the sample.
    score: float
    # Per-metric breakdown; each value is a scalar or a list of sub-scores.
    specific_scores: Dict[str, float | List[float]]

```

--------------------------------------------------------------------------------
/marker/schema/blocks/toc.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks.basetable import BaseTable


class TableOfContents(BaseTable):
    """A table-of-contents block; rendered like any table via ``BaseTable``."""

    # Annotated as BlockTypes (was ``str``) for consistency with the other
    # block classes (Char, Table, Form, Code, ...); the assigned value was
    # already a BlockTypes member, so a ``str`` annotation would coerce it
    # differently under pydantic validation/serialization.
    block_type: BlockTypes = BlockTypes.TableOfContents
    block_description: str = "A table of contents."

```

--------------------------------------------------------------------------------
/marker/schema/text/char.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Char(Block):
    """A single character inside a span (emitted when ``keep_chars`` is enabled)."""

    block_type: BlockTypes = BlockTypes.Char
    block_description: str = "A single character inside a span."

    # The character itself.
    text: str
    # Position index of the character (presumably within its parent span -- confirm).
    idx: int

```

--------------------------------------------------------------------------------
/marker/providers/utils.py:
--------------------------------------------------------------------------------

```python
def alphanum_ratio(text: str) -> float:
    """Return the fraction of alphanumeric characters in *text*.

    Spaces and newlines are removed before measuring, so they count as
    neither alphanumeric nor non-alphanumeric.  Input that is empty after
    that removal yields 1 (treated as fully alphanumeric) rather than
    dividing by zero.
    """
    stripped = text.replace(" ", "").replace("\n", "")
    if not stripped:
        # Preserve the original convention: empty input -> ratio of 1.
        return 1
    # Generator expression avoids building an intermediate list.
    alphanumeric_count = sum(1 for c in stripped if c.isalnum())
    return alphanumeric_count / len(stripped)

```

--------------------------------------------------------------------------------
/marker/schema/blocks/table.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks.basetable import BaseTable


class Table(BaseTable):
    """A standard data table block; rendering behavior comes from ``BaseTable``."""

    block_type: BlockTypes = BlockTypes.Table
    block_description: str = "A table of data, like a results table.  It will be in a tabular format."

```

--------------------------------------------------------------------------------
/benchmarks/overall/scorers/__init__.py:
--------------------------------------------------------------------------------

```python
from typing import List

from benchmarks.overall.scorers.schema import BlockScores


class BaseScorer:
    """Interface for benchmark scorers (e.g. heuristic or LLM based)."""

    def __init__(self):
        pass

    def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores:
        """Score *method_markdown* against ground-truth markdown segments.

        Args:
            sample: The benchmark sample being scored (shape depends on the
                dataset -- not constrained here).
            gt_markdown: Ground-truth markdown, split into segments.
            method_markdown: Candidate converter output to score.

        Returns:
            A ``BlockScores`` dict with an overall score and per-metric detail.
        """
        raise NotImplementedError()
```

--------------------------------------------------------------------------------
/tests/builders/test_structure.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.builders.structure import StructureBuilder


@pytest.mark.config({"page_range": [0]})
def test_structure_builder(pdf_document):
    """StructureBuilder should populate structure on the first page."""
    structure = StructureBuilder()
    structure(pdf_document)
    assert len(pdf_document.pages[0].structure) > 0

```

--------------------------------------------------------------------------------
/marker/schema/groups/__init__.py:
--------------------------------------------------------------------------------

```python
from marker.schema.blocks.base import Block
from marker.schema.groups.figure import FigureGroup
from marker.schema.groups.table import TableGroup
from marker.schema.groups.list import ListGroup
from marker.schema.groups.picture import PictureGroup
from marker.schema.groups.page import PageGroup

```

--------------------------------------------------------------------------------
/marker/builders/__init__.py:
--------------------------------------------------------------------------------

```python
from typing import Optional

from pydantic import BaseModel

from marker.util import assign_config


class BaseBuilder:
    """Base class for document builders.

    Accepts an optional config (pydantic model or dict) that is applied to
    the instance via ``assign_config``.
    """

    def __init__(self, config: Optional[BaseModel | dict] = None):
        # Apply config values onto this instance's attributes.
        assign_config(self, config)

    def __call__(self, data, *args, **kwargs):
        """Run the builder; subclasses must implement."""
        raise NotImplementedError

```

--------------------------------------------------------------------------------
/tests/builders/test_ocr_builder.py:
--------------------------------------------------------------------------------

```python
from PIL import Image

from marker.builders.ocr import OcrBuilder


def test_blank_char_builder(recognition_model):
    """An empty char list should produce no spans."""
    builder = OcrBuilder(recognition_model)
    image = Image.new("RGB", (100, 100))
    spans = builder.spans_from_html_chars([], None, image)  # Test with empty char list
    assert len(spans) == 0

```

--------------------------------------------------------------------------------
/marker/schema/blocks/form.py:
--------------------------------------------------------------------------------

```python
from typing import List

from marker.schema import BlockTypes
from marker.schema.blocks.basetable import BaseTable


class Form(BaseTable):
    """A form block (fields and labels); rendered via ``BaseTable``."""

    block_type: BlockTypes = BlockTypes.Form
    block_description: str = "A form, such as a tax form, that contains fields and labels.  It most likely doesn't have a table structure."

```

--------------------------------------------------------------------------------
/tests/renderers/test_json_renderer.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.renderers.json import JSONRenderer


@pytest.mark.config({"page_range": [0]})
def test_markdown_renderer_pagination(pdf_document):
    """The JSON renderer should emit one Page with typed children.

    NOTE(review): the name says "markdown" but this exercises ``JSONRenderer``
    -- likely copied from a markdown test; consider renaming.
    """
    renderer = JSONRenderer()
    pages = renderer(pdf_document).children

    assert len(pages) == 1
    assert pages[0].block_type == "Page"
    assert pages[0].children[0].block_type == "SectionHeader"

--------------------------------------------------------------------------------
/benchmarks/overall/schema.py:
--------------------------------------------------------------------------------

```python
from typing import TypedDict, List, Dict

from benchmarks.overall.scorers.schema import BlockScores

AVG_TYPE = Dict[str, Dict[str, Dict[str, List[float]]]]

class FullResult(TypedDict):
    """Aggregate result of an overall benchmark run."""

    # Presumably keyed by sample index, then method name, then score type -- confirm with callers.
    scores: Dict[int, Dict[str, Dict[str, BlockScores]]]
    # Nested averages (see AVG_TYPE above), grouped by document type.
    averages_by_type: AVG_TYPE
    # Nested averages grouped by block type.
    averages_by_block_type: AVG_TYPE
    # Per-method list of conversion times.
    average_times: Dict[str, List[float]]
    # Presumably sample index -> method -> markdown output -- confirm with callers.
    markdown: Dict[int, Dict[str, str]]

```

--------------------------------------------------------------------------------
/tests/builders/test_strip_existing_ocr.py:
--------------------------------------------------------------------------------

```python
import pytest


@pytest.mark.config({"page_range": [0], "strip_existing_ocr": True})
@pytest.mark.filename("handwritten.pdf")
def test_strip_ocr(doc_provider):
    """With strip_existing_ocr set, embedded OCR text should not be extracted."""
    # Ensure that the OCR text isn't extracted
    assert len(doc_provider.page_lines) == 0


@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("handwritten.pdf")
def test_keep_ocr(doc_provider):
    """Without strip_existing_ocr, the embedded OCR text is kept."""
    assert len(doc_provider.page_lines) == 1

```

--------------------------------------------------------------------------------
/tests/renderers/test_html_renderer.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.renderers.html import HTMLRenderer


@pytest.mark.config(
    {
        "page_range": [0],
        "disable_ocr": True,
        "add_block_ids": True,
        "paginate_output": True,
    }
)
def test_html_renderer_block_ids(pdf_document, config):
    """HTML output should embed block IDs when ``add_block_ids`` is set."""
    renderer = HTMLRenderer(config)
    html = renderer(pdf_document).html

    # Verify some block IDs are present
    assert "/page/0/Text/1" in html
```

--------------------------------------------------------------------------------
/tests/processors/test_document_toc_processor.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.processors.document_toc import DocumentTOCProcessor


@pytest.mark.config({"page_range": [0]})
def test_document_toc_processor(pdf_document, detection_model, recognition_model, table_rec_model):
    """The TOC processor should derive the document's table of contents.

    NOTE(review): the model fixtures are not used directly here -- presumably
    requested to ensure models are loaded for ``pdf_document``; confirm.
    """
    processor = DocumentTOCProcessor()
    processor(pdf_document)

    assert len(pdf_document.table_of_contents) == 4
    assert pdf_document.table_of_contents[0]["title"] == "Subspace Adversarial Training"

```

--------------------------------------------------------------------------------
/tests/processors/test_equation_processor.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.schema import BlockTypes
from marker.processors.equation import EquationProcessor


@pytest.mark.config({"page_range": [0]})
def test_equation_processor(pdf_document, recognition_model):
    """Every Equation block on the page should get HTML after processing."""
    processor = EquationProcessor(recognition_model)
    processor(pdf_document)

    for block in pdf_document.pages[0].children:
        if block.block_type == BlockTypes.Equation:
            assert block.html is not None
```

--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------

```python
from marker.providers.pdf import PdfProvider
import tempfile

import datasets


def setup_pdf_provider(
    filename='adversarial.pdf',
    config=None,
) -> PdfProvider:
    """Create a ``PdfProvider`` for a named test PDF.

    Downloads the ``datalab-to/pdfs`` dataset, writes the requested PDF's
    bytes to a temporary file, and builds a provider from that file path.

    Args:
        filename: Name of the PDF within the dataset's ``filename`` column.
        config: Optional provider configuration passed through to ``PdfProvider``.
    """
    dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
    idx = dataset['filename'].index(filename)

    # NOTE(review): ``temp_pdf`` is not returned, so the NamedTemporaryFile is
    # deleted once it is garbage-collected after this function returns --
    # presumably PdfProvider reads the file during construction; confirm.
    temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf")
    temp_pdf.write(dataset['pdf'][idx])
    temp_pdf.flush()

    provider = PdfProvider(temp_pdf.name, config)
    return provider

```

--------------------------------------------------------------------------------
/marker/schema/blocks/code.py:
--------------------------------------------------------------------------------

```python
import html

from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Code(Block):
    """A programming code block.

    Renders pre-computed HTML when available; otherwise the raw code is
    HTML-escaped and wrapped in a ``<pre>`` tag.
    """

    block_type: BlockTypes = BlockTypes.Code
    code: str | None = None
    html: str | None = None
    block_description: str = "A programming code block."

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        """Return the block's HTML, preferring pre-rendered ``self.html``."""
        if self.html:
            return self.html
        # Escape so code content cannot inject markup into the output.
        return "<pre>" + html.escape(self.code or "") + "</pre>"

```

--------------------------------------------------------------------------------
/marker/processors/__init__.py:
--------------------------------------------------------------------------------

```python
from typing import Optional, Tuple

from pydantic import BaseModel

from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.util import assign_config


class BaseProcessor:
    # Block types this processor is responsible for handling (None = unspecified).
    block_types: Tuple[BlockTypes] | None = None

    def __init__(self, config: Optional[BaseModel | dict] = None):
        """Copy recognized keys from the config onto instance attributes."""
        assign_config(self, config)

    def __call__(self, document: Document, *args, **kwargs):
        """Subclasses implement the actual processing pass over the document."""
        raise NotImplementedError

```

--------------------------------------------------------------------------------
/tests/processors/test_footnote_processor.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.processors.footnote import FootnoteProcessor
from marker.schema import BlockTypes


@pytest.mark.filename("population_stats.pdf")
@pytest.mark.config({"page_range": [4]})
def test_footnote_processor(pdf_document):
    """The page yields at least two footnotes, the last starting with '5'."""
    FootnoteProcessor()(pdf_document)

    footnotes = pdf_document.pages[0].contained_blocks(
        pdf_document, [BlockTypes.Footnote]
    )
    assert len(footnotes) >= 2

    last_footnote_text = footnotes[-1].raw_text(pdf_document).strip()
    assert last_footnote_text.startswith("5")

```

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------

```markdown
---
name: Feature request
about: Suggest an idea for this project
title: "[FEAT]"
labels: enhancement
assignees: ''

---

## ✨ Is your feature request related to a problem?

A clear and concise description of what the problem is. 

## 💡 Describe the Solution You'd Like

A concise description of what you want to happen or how you envision it working.

## 📋 Alternatives Considered

Any alternative solutions or workarounds you've tried.

## 🧩 Additional Context

Any additional context, references, or related issues.

```

--------------------------------------------------------------------------------
/marker/schema/blocks/reference.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Reference(Block):
    block_type: BlockTypes = BlockTypes.Reference
    ref: str
    block_description: str = "A reference to this block from another block."

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        """Wrap the assembled children in a span anchored by this block's ref id."""
        inner = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
        return f"<span id='{self.ref}'>{inner}</span>"

```

--------------------------------------------------------------------------------
/marker/utils/batch.py:
--------------------------------------------------------------------------------

```python
from marker.utils.gpu import GPUManager


def get_batch_sizes_worker_counts(gpu_manager: GPUManager, peak_worker_vram: int):
    """Choose batch-size overrides and a worker count from available GPU VRAM.

    Returns a (batch_sizes, workers) tuple; batch_sizes is empty when only a
    single worker fits, leaving the library defaults in place.
    """
    total_vram = gpu_manager.get_gpu_vram()
    worker_count = max(1, total_vram // peak_worker_vram)
    if worker_count == 1:
        return {}, worker_count

    batch_sizes = {
        "layout_batch_size": 12,
        "detection_batch_size": 8,
        "table_rec_batch_size": 12,
        "ocr_error_batch_size": 12,
        "recognition_batch_size": 64,
        "equation_batch_size": 16,
        "detector_postprocessing_cpu_workers": 2,
    }
    return batch_sizes, worker_count

```

--------------------------------------------------------------------------------
/tests/providers/test_pdf_provider.py:
--------------------------------------------------------------------------------

```python
import pytest


@pytest.mark.config({"page_range": [0]})
def test_pdf_provider(doc_provider):
    """Provider invariants: page count, render sizes, and first-line spans."""
    assert len(doc_provider) == 12

    for dpi, expected_size in ((72, (612, 792)), (96, (816, 1056))):
        assert doc_provider.get_images([0], dpi)[0].size == expected_size

    page_lines = doc_provider.get_page_lines(0)
    assert len(page_lines) == 85

    first_line_spans = page_lines[0].spans
    assert len(first_line_spans) == 2
    first_span = first_line_spans[0]
    assert first_span.text == "Subspace Adversarial Training"
    assert first_span.font == "NimbusRomNo9L-Medi"
    assert first_span.formats == ["plain"]

```

--------------------------------------------------------------------------------
/tests/providers/test_image_provider.py:
--------------------------------------------------------------------------------

```python
from marker.providers.image import ImageProvider
from marker.renderers.markdown import MarkdownOutput


def test_image_provider(config, temp_image):
    """A single-image provider reports one page and no extracted text lines."""
    provider = ImageProvider(temp_image.name, config)

    assert len(provider) == 1
    assert provider.get_images([0], 72)[0].size == (512, 512)
    assert len(provider.get_page_lines(0)) == 0

def test_image_provider_conversion(pdf_converter, temp_image):
    """End-to-end conversion of an image yields markdown with the expected text."""
    markdown_output: MarkdownOutput = pdf_converter(temp_image.name)
    assert "Hello, World!" in markdown_output.markdown



```

--------------------------------------------------------------------------------
/marker/schema/groups/picture.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.groups.base import Group


class PictureGroup(Group):
    block_type: BlockTypes = BlockTypes.PictureGroup
    block_description: str = "A picture along with associated captions."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Return pre-rendered HTML when present, otherwise assemble children."""
        if not self.html:
            return super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
        return self.html

```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/mathpix.py:
--------------------------------------------------------------------------------

```python
import datasets

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


class MathpixMethod(BaseMethod):
    mathpix_ds: datasets.Dataset = None

    def __call__(self, sample) -> BenchmarkResult:
        """Look up the precomputed mathpix result for this sample by uuid."""
        uuid = sample["uuid"]
        data = next(
            (row for row in self.mathpix_ds if str(row["uuid"]) == str(uuid)),
            None,
        )
        if not data:
            raise ValueError(f"Could not find data for uuid {uuid}")

        return {"markdown": data["md"], "time": data["time"]}
```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/mistral.py:
--------------------------------------------------------------------------------

```python
import datasets

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


class MistralMethod(BaseMethod):
    mistral_ds: datasets.Dataset = None

    def __call__(self, sample) -> BenchmarkResult:
        """Look up the precomputed mistral result for this sample by uuid."""
        uuid = sample["uuid"]
        data = next(
            (row for row in self.mistral_ds if str(row["uuid"]) == str(uuid)),
            None,
        )
        if not data:
            raise ValueError(f"Could not find data for uuid {uuid}")

        return {"markdown": data["md"], "time": data["time"]}
```

--------------------------------------------------------------------------------
/marker/scripts/run_streamlit_app.py:
--------------------------------------------------------------------------------

```python
import subprocess
import os
import sys


def streamlit_app_cli(app_name: str = "streamlit_app.py"):
    """Launch the named streamlit app, forwarding extra CLI args after `--`."""
    passthrough = sys.argv[1:]
    app_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), app_name)
    cmd = [
        "streamlit",
        "run",
        app_path,
        "--server.fileWatcherType",
        "none",
        "--server.headless",
        "true",
    ]
    if passthrough:
        # Everything after `--` is passed to the app itself, not streamlit.
        cmd.append("--")
        cmd.extend(passthrough)
    subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"})


def extraction_app_cli():
    """Launch the extraction UI via the shared streamlit launcher."""
    streamlit_app_cli("extraction_app.py")

```

--------------------------------------------------------------------------------
/marker/schema/groups/figure.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.groups.base import Group


class FigureGroup(Group):
    block_type: BlockTypes = BlockTypes.FigureGroup
    block_description: str = "A group that contains a figure and associated captions."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Return pre-rendered HTML when present, otherwise assemble children."""
        if not self.html:
            return super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
        return self.html

```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/llamaparse.py:
--------------------------------------------------------------------------------

```python
import datasets

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


class LlamaParseMethod(BaseMethod):
    llamaparse_ds: datasets.Dataset = None

    def __call__(self, sample) -> BenchmarkResult:
        """Look up the precomputed llamaparse result for this sample by uuid."""
        uuid = sample["uuid"]
        data = next(
            (row for row in self.llamaparse_ds if str(row["uuid"]) == str(uuid)),
            None,
        )
        if not data:
            raise ValueError(f"Could not find data for uuid {uuid}")

        return {"markdown": data["md"], "time": data["time"]}
```

--------------------------------------------------------------------------------
/marker/schema/blocks/handwriting.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Handwriting(Block):
    block_type: BlockTypes = BlockTypes.Handwriting
    block_description: str = "A region that contains handwriting."
    html: str | None = None
    replace_output_newlines: bool = True

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Return pre-rendered HTML when present, otherwise assemble children."""
        if not self.html:
            return super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
        return self.html

```

--------------------------------------------------------------------------------
/tests/schema/groups/test_list_grouping.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.builders.structure import StructureBuilder
from marker.schema import BlockTypes


@pytest.mark.config({"page_range": [4]})
@pytest.mark.skip(reason="Model breaks this up due to equations")
def test_list_grouping(pdf_document):
    """Structure building groups the page's list items into three ListGroups."""
    StructureBuilder()(pdf_document)

    page = pdf_document.pages[0]
    list_groups = [
        block
        for block in page.children
        if block.block_type == BlockTypes.ListGroup
    ]

    # The model breaks this up, since it has equations in it
    assert len(list_groups) == 3

```

--------------------------------------------------------------------------------
/tests/renderers/test_extract_images.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.renderers.markdown import MarkdownRenderer


@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("A17_FlightPlan.pdf")
def test_disable_extract_images(pdf_document):
    """With extract_images disabled, no jpeg references land in the markdown."""
    markdown = MarkdownRenderer({"extract_images": False})(pdf_document).markdown
    assert "jpeg" not in markdown


@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("A17_FlightPlan.pdf")
def test_extract_images(pdf_document):
    """By default, extracted images are referenced in the markdown."""
    markdown = MarkdownRenderer()(pdf_document).markdown
    assert "jpeg" in markdown
```

--------------------------------------------------------------------------------
/marker/renderers/extraction.py:
--------------------------------------------------------------------------------

```python
from pydantic import BaseModel

from marker.extractors.document import DocumentExtractionSchema
from marker.renderers import BaseRenderer


class ExtractionOutput(BaseModel):
    """Final output of structured extraction: analysis, JSON, and source markdown."""
    # Analysis text carried over from the extraction schema.
    analysis: str
    # Extracted structured data, kept as a JSON string.
    document_json: str
    # The markdown the extraction was performed against.
    original_markdown: str


class ExtractionRenderer(BaseRenderer):
    def __call__(
        self, output: DocumentExtractionSchema, markdown: str
    ) -> ExtractionOutput:
        """Package the extraction schema fields and source markdown into the output model."""
        # We definitely want to do more complex stuff here soon, so leave it in
        rendered = ExtractionOutput(
            analysis=output.analysis,
            document_json=output.document_json,
            original_markdown=markdown,
        )
        return rendered

```

--------------------------------------------------------------------------------
/marker/scripts/chunk_convert.py:
--------------------------------------------------------------------------------

```python
import argparse
import os
import subprocess
import pkg_resources


def chunk_convert_cli():
    """CLI entry point: convert a folder of PDFs to markdown in chunks.

    Invokes the bundled chunk_convert.sh with the input and output folders.

    Raises:
        subprocess.CalledProcessError: if the shell script exits non-zero.
    """
    parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
    parser.add_argument("in_folder", help="Input folder with pdfs.")
    parser.add_argument("out_folder", help="Output folder")
    args = parser.parse_args()

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    script_path = os.path.join(cur_dir, "chunk_convert.sh")

    # Pass arguments as an argv list instead of an interpolated shell string:
    # folder names containing spaces or shell metacharacters are handled
    # safely, and shell injection is impossible.
    subprocess.run([script_path, args.in_folder, args.out_folder], check=True)
```

--------------------------------------------------------------------------------
/tests/processors/test_ignoretext.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.processors.ignoretext import IgnoreTextProcessor
from marker.schema import BlockTypes


@pytest.mark.filename("bio_pdf.pdf")
@pytest.mark.config({"page_range": list(range(10))})
@pytest.mark.skip(reason="New layout model correctly identifies the block as a PageHeader, so nothing to be done by the IgnoreTextProcessor")
def test_ignoretext_processor(pdf_document):
    """The repeated header text is detected and excluded from output."""
    IgnoreTextProcessor()(pdf_document)

    header_block = pdf_document.pages[1].contained_blocks(
        pdf_document, [BlockTypes.Text]
    )[0]
    assert "bioRxiv" in header_block.raw_text(pdf_document)
    assert header_block.ignore_for_output is True

```

--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------

```yaml
# Continuous integration: run the pytest suite on every push.
name: CI tests

on: [push]

jobs:
  tests:
    # Self-hosted runner labeled t4_gpu.
    runs-on: t4_gpu
    steps:
      - uses: actions/checkout@v3
      # Native libraries (pango, cairo, etc.) installed before Python deps.
      - name: Install apt requirements
        run: |
          sudo apt-get update
          sudo apt-get install -y libpango-1.0-0 libharfbuzz0b libpangoft2-1.0-0 libgdk-pixbuf2.0-0 libcairo2 libffi-dev shared-mime-info
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install --extras "full"
      - name: Run tests
        env:
          # Hugging Face auth token, supplied via repository secrets.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: poetry run pytest
```

--------------------------------------------------------------------------------
/marker/schema/blocks/footnote.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Footnote(Block):
    block_type: BlockTypes = BlockTypes.Footnote
    block_description: str = (
        "A footnote that explains a term or concept in the document."
    )
    replace_output_newlines: bool = True
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Route pre-rendered HTML through the HTML-output path, else assemble children."""
        if not self.html:
            return super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
        return super().handle_html_output(
            document, child_blocks, parent_structure, block_config
        )

```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/gt.py:
--------------------------------------------------------------------------------

```python
from typing import List
import json

from PIL import Image

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


class GTMethod(BaseMethod):
    def __call__(self, sample) -> BenchmarkResult:
        """Return the ground-truth markdown for a sample, converted block by block."""
        blocks = json.loads(sample["gt_blocks"])
        html_fragments = [b["html"] for b in blocks if len(b["html"]) > 0]
        markdown_fragments = [self.convert_to_md(fragment) for fragment in html_fragments]
        return {
            "markdown": markdown_fragments,
            "time": 0,
        }

    def render(self, html: List[str]) -> Image.Image:
        """Join the HTML fragments into a single page and rasterize it."""
        joined = "\n\n".join(html)
        page = f"""
<html>
<head></head>
<body>
{joined}
</body>
</html>
""".strip()
        return self.html_to_image(page)
```

--------------------------------------------------------------------------------
/marker/schema/blocks/caption.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Caption(Block):
    block_type: BlockTypes = BlockTypes.Caption
    block_description: str = "A text caption that is directly above or below an image or table. Only used for text describing the image or table.  "
    replace_output_newlines: bool = True
    html: str | None = None

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        """Route pre-rendered HTML through the HTML-output path, else assemble children."""
        if not self.html:
            return super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
        return super().handle_html_output(
            document, child_blocks, parent_structure, block_config
        )

```

--------------------------------------------------------------------------------
/marker/schema/__init__.py:
--------------------------------------------------------------------------------

```python
from enum import auto, Enum


class BlockTypes(str, Enum):
    """Enumeration of every block type marker can represent.

    Member values are auto-generated, so the declaration order below must not
    change. str(member) yields the member's name via __str__.
    """
    # Low-level text primitives.
    Line = auto()
    Span = auto()
    Char = auto()
    # Grouping containers.
    FigureGroup = auto()
    TableGroup = auto()
    ListGroup = auto()
    PictureGroup = auto()
    Page = auto()
    # Content blocks.
    Caption = auto()
    Code = auto()
    Figure = auto()
    Footnote = auto()
    Form = auto()
    Equation = auto()
    Handwriting = auto()
    TextInlineMath = auto()
    ListItem = auto()
    PageFooter = auto()
    PageHeader = auto()
    Picture = auto()
    SectionHeader = auto()
    Table = auto()
    Text = auto()
    TableOfContents = auto()
    Document = auto()
    ComplexRegion = auto()
    TableCell = auto()
    Reference = auto()

    def __str__(self):
        # Render as the bare name (e.g. "Table"), not "BlockTypes.Table".
        return self.name

```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/docling.py:
--------------------------------------------------------------------------------

```python
import tempfile
import time

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


class DoclingMethod(BaseMethod):
    model_dict: dict = None
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        """Convert a single-page PDF sample with docling, timing only the conversion."""
        from docling.document_converter import DocumentConverter

        converter = DocumentConverter()
        pdf_bytes = sample["pdf"]  # This is a single page PDF

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            start = time.time()
            result = converter.convert(f.name)
            elapsed = time.time() - start

        return {
            "markdown": result.document.export_to_markdown(),
            "time": elapsed,
        }


```

--------------------------------------------------------------------------------
/marker/schema/blocks/pagefooter.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class PageFooter(Block):
    # Annotated as BlockTypes (was `str`) for consistency with every sibling
    # block class (PageHeader, Caption, Footnote, ...); the default is unchanged.
    block_type: BlockTypes = BlockTypes.PageFooter
    block_description: str = (
        "Text that appears at the bottom of a page, like a page number."
    )
    replace_output_newlines: bool = True
    # Footers are suppressed from output by default.
    ignore_for_output: bool = True
    html: str | None = None

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        """Render the footer, emitting it only when configured to keep footers."""
        if block_config and block_config.get("keep_pagefooter_in_output"):
            self.ignore_for_output = False

        if self.html and not self.ignore_for_output:
            return self.html

        return super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )

```

--------------------------------------------------------------------------------
/marker/schema/blocks/pageheader.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class PageHeader(Block):
    block_type: BlockTypes = BlockTypes.PageHeader
    block_description: str = (
        "Text that appears at the top of a page, like a page title."
    )
    replace_output_newlines: bool = True
    # Headers are suppressed from output by default.
    ignore_for_output: bool = True
    html: str | None = None

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        """Render the header, emitting it only when configured to keep headers."""
        keep_header = bool(block_config and block_config.get("keep_pageheader_in_output"))
        if keep_header:
            self.ignore_for_output = False

        if self.html and not self.ignore_for_output:
            return self.html

        return super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )

```

--------------------------------------------------------------------------------
/marker/schema/groups/table.py:
--------------------------------------------------------------------------------

```python
from typing import List

from marker.schema import BlockTypes
from marker.schema.blocks import BlockOutput
from marker.schema.groups.base import Group


class TableGroup(Group):
    block_type: BlockTypes = BlockTypes.TableGroup
    block_description: str = "A table along with associated captions."
    html: str | None = None

    def assemble_html(
        self,
        document,
        child_blocks: List[BlockOutput],
        parent_structure=None,
        block_config: dict | None = None,
    ):
        """Route pre-rendered HTML through the HTML-output path, else assemble children."""
        if not self.html:
            return super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
        return self.handle_html_output(
            document, child_blocks, parent_structure, block_config
        )
```

--------------------------------------------------------------------------------
/tests/builders/test_rotated_bboxes.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.schema import BlockTypes


@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("adversarial_rot.pdf")
def test_rotated_bboxes(pdf_document):
    """Text lines on a rotated page stay within the layout text blocks."""
    first_page = pdf_document.pages[0]

    # Ensure we match all text lines up properly
    text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
    text_blocks = first_page.contained_blocks(
        pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
    )
    # assert len(text_lines) == 84

    # Ensure the bbox sizes match up
    max_line_x = max(line.polygon.x_end for line in text_lines)
    max_block_x = max(
        block.polygon.x_end for block in text_blocks if block.source == "layout"
    )
    assert max_line_x <= max_block_x

```

--------------------------------------------------------------------------------
/marker/processors/document_toc.py:
--------------------------------------------------------------------------------

```python
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document


class DocumentTOCProcessor(BaseProcessor):
    """
    A processor for generating a table of contents for the document.
    """
    block_types = (BlockTypes.SectionHeader, )

    def __call__(self, document: Document):
        """Collect all section headers into document.table_of_contents."""
        document.table_of_contents = [
            {
                "title": header.raw_text(document).strip(),
                "heading_level": header.heading_level,
                "page_id": page.page_id,
                "polygon": header.polygon.polygon,
            }
            for page in document.pages
            for header in page.contained_blocks(document, self.block_types)
        ]

```

--------------------------------------------------------------------------------
/benchmarks/overall/registry.py:
--------------------------------------------------------------------------------

```python
from benchmarks.overall.methods.docling import DoclingMethod
from benchmarks.overall.methods.gt import GTMethod
from benchmarks.overall.methods.llamaparse import LlamaParseMethod
from benchmarks.overall.methods.marker import MarkerMethod
from benchmarks.overall.methods.mathpix import MathpixMethod
from benchmarks.overall.methods.mistral import MistralMethod
from benchmarks.overall.methods.olmocr import OlmOCRMethod
from benchmarks.overall.scorers.heuristic import HeuristicScorer
from benchmarks.overall.scorers.llm import LLMScorer

# Maps scorer CLI names to their scorer classes.
SCORE_REGISTRY = {
    "heuristic": HeuristicScorer,
    "llm": LLMScorer
}

# Maps benchmark method CLI names to their method classes.
METHOD_REGISTRY = {
    "marker": MarkerMethod,
    "gt": GTMethod,
    "mathpix": MathpixMethod,
    "llamaparse": LlamaParseMethod,
    "docling": DoclingMethod,
    "olmocr": OlmOCRMethod,
    "mistral": MistralMethod
}
```

--------------------------------------------------------------------------------
/marker/processors/page_header.py:
--------------------------------------------------------------------------------

```python
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup


class PageHeaderProcessor(BaseProcessor):
    """
    A processor for moving PageHeaders to the top
    """
    block_types = (BlockTypes.PageHeader,)

    def __call__(self, document: Document):
        for page in document.pages:
            self.move_page_header_to_top(page, document)

    def move_page_header_to_top(self, page: PageGroup, document: Document):
        """Reorder page.structure so all PageHeader blocks come first."""
        header_ids = [
            block.id for block in page.contained_blocks(document, self.block_types)
        ]
        # Pull headers out of their current positions, then prepend them.
        for header_id in header_ids:
            page.structure.remove(header_id)
        page.structure[:0] = header_ids


```

--------------------------------------------------------------------------------
/marker/schema/groups/list.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.groups.base import Group


class ListGroup(Group):
    block_type: BlockTypes = BlockTypes.ListGroup
    has_continuation: bool = False
    block_description: str = "A group of list items that should be rendered together."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Render children in a <ul>, tagging continuation groups with a CSS class."""
        if self.html:
            return self.handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        inner = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )

        attrs = f" block-type='{self.block_type}'"
        if self.has_continuation:
            attrs = attrs + " class='has-continuation'"
        return f"<p{attrs}><ul>{inner}</ul></p>"

```

--------------------------------------------------------------------------------
/benchmarks/overall/download/main.py:
--------------------------------------------------------------------------------

```python
import click

from benchmarks.overall.download.llamaparse import LlamaParseDownloader
from benchmarks.overall.download.mathpix import MathpixDownloader
from benchmarks.overall.download.mistral import MistralDownloader


@click.command("Download data from inference services")
@click.argument("service", type=click.Choice(["mathpix", "llamaparse", "mistral"]))
@click.option("--max_rows", type=int, default=2200)
@click.option("--api_key", type=str, default=None)
@click.option("--app_id", type=str, default=None)
def main(service: str, max_rows: int, api_key: str, app_id: str):
    """Download benchmark data from the chosen service and upload it to the hub."""
    downloader_cls = {
        "mathpix": MathpixDownloader,
        "llamaparse": LlamaParseDownloader,
        "mistral": MistralDownloader,
    }[service]
    downloader = downloader_cls(api_key, app_id, max_rows=max_rows)

    # Generate data and upload to hub
    downloader()

if __name__ == "__main__":
    main()

```

--------------------------------------------------------------------------------
/marker/logger.py:
--------------------------------------------------------------------------------

```python
import logging
import warnings

from marker.settings import settings


def configure_logging():
    """Attach a stream handler to the marker logger and quiet noisy libraries."""
    logger = get_logger()

    # Only install a handler once, even if called repeatedly.
    if not logger.handlers:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(
            logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")
        )
        logger.addHandler(stream_handler)

    logger.setLevel(settings.LOGLEVEL)

    # Ignore future warnings
    warnings.simplefilter(action="ignore", category=FutureWarning)

    # Raise the log thresholds of chatty third-party loggers.
    for noisy_logger, level in (
        ("PIL", logging.ERROR),
        ("fontTools.subset", logging.ERROR),
        ("fontTools.ttLib.ttFont", logging.ERROR),
        ("weasyprint", logging.CRITICAL),
    ):
        logging.getLogger(noisy_logger).setLevel(level)


def get_logger():
    """Return the single shared logger for the marker package."""
    marker_logger = logging.getLogger("marker")
    return marker_logger

```

--------------------------------------------------------------------------------
/tests/processors/test_table_merge.py:
--------------------------------------------------------------------------------

```python
from unittest.mock import Mock

import pytest

from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
from marker.processors.table import TableProcessor
from marker.schema import BlockTypes


@pytest.mark.filename("table_ex2.pdf")
def test_llm_table_processor_nomerge(pdf_document, table_rec_model, recognition_model, detection_model, mocker):
    """Tables that should not merge stay separate even when the LLM says merge."""
    mock_cls = Mock()
    mock_cls.return_value = {"merge": "true", "direction": "right"}

    TableProcessor(recognition_model, table_rec_model, detection_model)(pdf_document)
    assert len(pdf_document.contained_blocks((BlockTypes.Table,))) == 3

    merge_processor = LLMTableMergeProcessor(
        mock_cls, {"use_llm": True, "gemini_api_key": "test"}
    )
    merge_processor(pdf_document)
    assert len(pdf_document.contained_blocks((BlockTypes.Table,))) == 3
```

--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------

```yaml
# Build and publish the package to PyPI when a version tag (vX.Y.Z) is pushed.
name: Python package
on:
  push:
    tags:
      - "v*.*.*"
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install --extras "full"
      - name: Build package
        run: |
          poetry build
      # Read the version out of pyproject.toml and expose it as a step output.
      - name: Extract version from pyproject.toml
        id: version
        run: |
          VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['tool']['poetry']['version'])")
          echo "version=v$VERSION" >> $GITHUB_OUTPUT
      - name: Publish package
        env:
          # PyPI API token, supplied via repository secrets.
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
        run: |
          poetry config pypi-token.pypi "$PYPI_TOKEN"
          poetry publish
```

--------------------------------------------------------------------------------
/marker/schema/blocks/sectionheader.py:
--------------------------------------------------------------------------------

```python
from typing import Optional

from marker.schema import BlockTypes
from marker.schema.blocks import Block


class SectionHeader(Block):
    block_type: BlockTypes = BlockTypes.SectionHeader
    heading_level: Optional[int] = None
    block_description: str = "The header of a section of text or other blocks."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Render this header as an <hN> element, or pass through stored HTML."""
        if self.ignore_for_output:
            return ""

        # An llm processor may have already produced final HTML for this block.
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        inner = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        ).replace("\n", " ")
        level = self.heading_level if self.heading_level else 2
        return f"<h{level}>{inner}</h{level}>"

```

--------------------------------------------------------------------------------
/tests/renderers/test_chunk_renderer.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.renderers.chunk import ChunkRenderer


@pytest.mark.config({"page_range": [0]})
def test_chunk_renderer(pdf_document):
    """Chunk output keeps figures/captions merged into their FigureGroup."""
    output = ChunkRenderer()(pdf_document)
    blocks, page_info = output.blocks, output.page_info

    assert len(blocks) == 14
    assert blocks[0].block_type == "SectionHeader"
    assert page_info[0]["bbox"] is not None
    assert page_info[0]["polygon"] is not None

    # Bucket blocks by type: the group should absorb its figure and caption.
    by_type = {}
    for blk in blocks:
        by_type.setdefault(blk.block_type, []).append(blk)

    assert len(by_type.get("FigureGroup", [])) == 1
    assert len(by_type.get("Figure", [])) == 0
    assert len(by_type.get("Caption", [])) == 0

    group = by_type["FigureGroup"][0]
    assert group.images is not None
    assert len(group.images) == 1
    assert "<img src='/page/0/Figure/9'>" in group.html
```

--------------------------------------------------------------------------------
/marker/providers/html.py:
--------------------------------------------------------------------------------

```python
import os
import tempfile

from marker.providers.pdf import PdfProvider


class HTMLProvider(PdfProvider):
    """Provider that renders an HTML file to a temporary PDF, then delegates to PdfProvider."""

    def __init__(self, filepath: str, config=None):
        # Create the temp file up front so the path exists for both the
        # conversion below and cleanup in __del__.
        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        self.temp_pdf_path = temp_pdf.name
        temp_pdf.close()

        # Convert HTML to PDF. On failure, delete the orphaned temp file
        # ourselves (super().__init__ never ran, so nothing else will), and
        # chain the original exception for debuggability.
        try:
            self.convert_html_to_pdf(filepath)
        except Exception as e:
            if os.path.exists(self.temp_pdf_path):
                os.remove(self.temp_pdf_path)
            raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}") from e

        # Initialize the PDF provider with the temp pdf path
        super().__init__(self.temp_pdf_path, config)

    def __del__(self):
        # temp_pdf_path may be absent if __init__ failed before assigning it.
        path = getattr(self, "temp_pdf_path", None)
        if path and os.path.exists(path):
            os.remove(path)

    def convert_html_to_pdf(self, filepath: str):
        # Imported lazily: weasyprint is a heavy optional dependency.
        from weasyprint import HTML

        font_css = self.get_font_css()
        HTML(filename=filepath, encoding="utf-8").write_pdf(
            self.temp_pdf_path, stylesheets=[font_css]
        )
```

--------------------------------------------------------------------------------
/data/latex_to_md.sh:
--------------------------------------------------------------------------------

```bash
#!/bin/bash

# Convert every latex/*.tex file to markdown in references/ using pandoc,
# then strip pandoc/latex artifacts with sed.

# Ensure the output directory exists before writing into it.
mkdir -p references

# List all .tex files in the latex folder; read line-by-line so paths with
# spaces survive (unquoted $FILES word-splitting would break them).
find latex -name "*.tex" | while read -r f
do
  echo "Processing $f file..."
  base_name=$(basename "$f" .tex)
  out_file="references/${base_name}.md"

  pandoc --wrap=none \
         --no-highlight \
         --strip-comments \
         --from=latex \
         --to=commonmark_x+pipe_tables \
         "$f" \
         -o "$out_file"

  # Replace non-breaking spaces with regular spaces.
  # NOTE: "sed -i.bak" (no space before the suffix) works with both GNU and
  # BSD sed; the previous "sed -i .bak" form is BSD-only and fails on Linux.
  sed -i.bak 's/ / /g' "$out_file"
  sed -i.bak 's/ / /g' "$out_file"
  sed -i.bak 's/ / /g' "$out_file"
  sed -i.bak 's/ / /g' "$out_file"
  # Strip citation commands, inline HTML tags, and leftover brace groups.
  sed -i.bak -E 's/`\\cite`//g; s/<[^>]*>//g; s/\{[^}]*\}//g; s/\\cite\{[^}]*\}//g' "$out_file"
  sed -i.bak -E '
    s/`\\cite`//g;   # Remove \cite commands inside backticks
    s/::: //g;       # Remove the leading ::: for content markers
    s/\[//g;         # Remove opening square bracket
    s/\]//g;         # Remove closing square bracket
  ' "$out_file"
  # Remove the .bak backup left by the final sed invocation
  rm "$out_file.bak"
done


```

--------------------------------------------------------------------------------
/marker/schema/blocks/figure.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Figure(Block):
    block_type: BlockTypes = BlockTypes.Figure
    description: str | None = None
    html: str | None = None
    block_description: str = "A chart or other image that contains data."

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Render reference children, appending an LLM description when present."""
        # Pre-rendered HTML from an llm processor takes precedence.
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        references = [
            child for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        rendered = super().assemble_html(
            document, references, parent_structure, block_config
        )
        if not self.description:
            return rendered
        return rendered + (
            f"<p role='img' data-original-image-id='{self.id}'>"
            f"Image {self.id} description: {self.description}</p>"
        )

```

--------------------------------------------------------------------------------
/marker/schema/blocks/equation.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Equation(Block):
    block_type: BlockTypes = BlockTypes.Equation
    html: str | None = None
    block_description: str = "A block math equation."

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        """Wrap the equation in a <p block-type=...> element.

        When an llm processor populated ``html``, only Reference children are
        rendered before appending that HTML; otherwise all children are used.
        """
        if not self.html:
            inner = super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
            return f"<p block-type='{self.block_type}'>{inner}</p>"

        references = [
            child for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        out = super().assemble_html(
            document, references, parent_structure, block_config
        )
        return out + f"<p block-type='{self.block_type}'>{self.html}</p>"

```

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/output-bug-report.md:
--------------------------------------------------------------------------------

```markdown
---
name: Output bug report
about: Create a report about poor output quality
title: "[BUG: Output]"
labels: 'bug: output'
assignees: ''

---

## 📝 Describe the Output Issue

A clear and concise description of the incorrect or unexpected output.

## 📄 Input Document

Attach the PDF or input file used.

## 📤 Current Output

Paste the Markdown or HTML that Marker generated:

````markdown
Paste output here
````

## ✅ Expected Output

Describe or paste what you expected Marker to generate.

## ⚙️ Environment

Please fill in all relevant details:

* **Marker version**:
* **Surya version**:
* **Python version**:
* **PyTorch version**:
* **Transformers version**:
* **Operating System**:

## 📟 Command or Code Used

Paste the **exact bash command** or **Python code** you used to run Marker:

<details>
<summary>Click to expand</summary>

```bash
# or Python code block
your_command_here --with-flags
```

</details>

## 📎 Additional Context

Any other relevant info, configs, or assumptions.

```

--------------------------------------------------------------------------------
/tests/converters/test_table_converter.py:
--------------------------------------------------------------------------------

```python
import pytest
from marker.converters.table import TableConverter
from marker.renderers.markdown import MarkdownOutput
from marker.util import classes_to_strings

def _table_converter(config, model_dict, renderer, temp_pdf):
    """Run TableConverter on temp_pdf and sanity-check the markdown output."""
    table_converter = TableConverter(
        artifact_dict=model_dict,
        processor_list=None,
        renderer=classes_to_strings([renderer])[0],
        config=config,
    )

    result: MarkdownOutput = table_converter(temp_pdf.name)
    text = result.markdown

    # Conversion should produce non-empty markdown containing known table text.
    assert len(text) > 0
    assert "cyclic" in text


@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5]})
def test_table_converter(config, model_dict, renderer, temp_doc):
    # Table extraction using the PDF's embedded text layer (no OCR).
    _table_converter(config, model_dict, renderer, temp_doc)

@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5], "force_ocr": True})
def test_table_converter_ocr(config, model_dict, renderer, temp_doc):
    # Same page, but with OCR forced over the table contents.
    _table_converter(config, model_dict, renderer, temp_doc)


```

--------------------------------------------------------------------------------
/tests/builders/test_blank_page.py:
--------------------------------------------------------------------------------

```python
from surya.layout.schema import LayoutResult

from marker.builders.document import DocumentBuilder
from marker.builders.layout import LayoutBuilder
from marker.builders.line import LineBuilder


def test_blank_page(config, doc_provider, layout_model, ocr_error_model, detection_model):
    """Pages with no detected layout or lines still get list-typed children/structure."""
    document = DocumentBuilder(config).build_document(doc_provider)

    # Simulate a layout pass that found nothing on any page.
    empty_layouts = [
        LayoutResult(bboxes=[], image_bbox=page.polygon.bbox)
        for page in document.pages
    ]
    no_provider_lines = {page.page_id: [] for page in document.pages}
    no_ocr_lines = {page.page_id: [] for page in document.pages}

    LayoutBuilder(layout_model, config).add_blocks_to_pages(document.pages, empty_layouts)
    LineBuilder(detection_model, ocr_error_model).merge_blocks(
        document, no_provider_lines, no_ocr_lines
    )

    assert all(isinstance(page.children, list) for page in document.pages)
    assert all(isinstance(page.structure, list) for page in document.pages)
```

--------------------------------------------------------------------------------
/marker/schema/blocks/complexregion.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class ComplexRegion(Block):
    block_type: BlockTypes = BlockTypes.ComplexRegion
    html: str | None = None
    block_description: str = "A complex region that can consist of multiple different types of blocks mixed with images. This block is chosen when it is difficult to categorize the region as a single block type."

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        """Emit stored HTML (after rendered references) when present, else a plain <p>."""
        if not self.html:
            inner = super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
            return f"<p>{inner}</p>"

        references = [
            child for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        rendered = super().assemble_html(
            document, references, parent_structure, block_config
        )
        return rendered + self.html

```

--------------------------------------------------------------------------------
/marker/schema/blocks/picture.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Picture(Block):
    block_type: BlockTypes = BlockTypes.Picture
    description: str | None = None
    block_description: str = "An image block that represents a picture."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Render reference children for this picture, appending any description."""
        # Pre-rendered HTML from an llm processor wins outright.
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        references = [
            child for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        rendered = super().assemble_html(
            document, references, parent_structure, block_config
        )

        if not self.description:
            return rendered
        return rendered + (
            f"<p role='img' data-original-image-id='{self.id}'>"
            f"Image {self.id} description: {self.description}</p>"
        )

```

--------------------------------------------------------------------------------
/.github/workflows/benchmarks.yml:
--------------------------------------------------------------------------------

```yaml
# Runs small (max_rows 5) overall and table benchmarks on every push, on both
# a GPU runner (t4_gpu) and a CPU runner, then checks the resulting scores
# with benchmarks/verify_scores.py, which fails the job below its thresholds.
name: Integration test

on: [push]

env:
  PYTHONIOENCODING: "utf-8"

jobs:
  benchmark:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [t4_gpu, ubuntu-latest]
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      # pandoc is installed for the benchmark tooling
      - name: Install apt dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y pandoc
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install --extras "full"
      - name: Run benchmark test
        run: |
          poetry run python benchmarks/overall/overall.py --max_rows 5
          poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/result.json --type marker
      - name: Run table benchmark
        run: |
          poetry run python benchmarks/table/table.py --max_rows 5
          poetry run python benchmarks/verify_scores.py conversion_results/benchmark/table/table.json --type table

--------------------------------------------------------------------------------
/marker/services/vertex.py:
--------------------------------------------------------------------------------

```python
from typing import Annotated

from google import genai

from marker.services.gemini import BaseGeminiService

class GoogleVertexService(BaseGeminiService):
    """Gemini service routed through Google Vertex AI instead of the public Gemini API."""

    # NOTE(review): annotated as str but defaults to None — callers are
    # presumably required to supply a project id; confirm upstream validation.
    vertex_project_id: Annotated[
        str,
        "Google Cloud Project ID for Vertex AI.",
    ] = None
    vertex_location: Annotated[
        str,
        "Google Cloud Location for Vertex AI.",
    ] = "us-central1"
    gemini_model_name: Annotated[
        str,
        "The name of the Google model to use for the service."
    ] = "gemini-2.0-flash-001"
    vertex_dedicated: Annotated[
        bool,
        "Whether to use a dedicated Vertex AI instance."
    ] = False

    def get_google_client(self, timeout: int):
        """Build a Vertex-backed genai client; ``timeout`` is given in seconds."""
        http_options = {"timeout": timeout * 1000} # Convert to milliseconds
        # Dedicated-capacity requests are flagged via this request header.
        if self.vertex_dedicated:
            http_options["headers"] = {"x-vertex-ai-llm-request-type": "dedicated"}
        return genai.Client(
            vertexai=True,
            project=self.vertex_project_id,
            location=self.vertex_location,
            http_options=http_options,
        )
```

--------------------------------------------------------------------------------
/marker/scripts/chunk_convert.sh:
--------------------------------------------------------------------------------

```bash
#!/bin/bash

# Fan a folder of documents out across NUM_DEVICES GPUs, running marker once
# per device with NUM_WORKERS workers each.
# Usage: NUM_DEVICES=2 NUM_WORKERS=4 ./chunk_convert.sh <input_folder> <output_folder>

# Kill all child processes if the user interrupts the run
trap 'pkill -P $$' SIGINT

# Check if NUM_DEVICES is set
if [[ -z "$NUM_DEVICES" ]]; then
    echo "Please set the NUM_DEVICES environment variable."
    exit 1
fi

if [[ -z "$NUM_WORKERS" ]]; then
    echo "Please set the NUM_WORKERS environment variable."
    exit 1
fi

# Get input folder and output folder from args
if [[ -z "$1" ]]; then
    echo "Please provide an input folder."
    exit 1
fi

if [[ -z "$2" ]]; then
    echo "Please provide an output folder."
    exit 1
fi

INPUT_FOLDER=$1
OUTPUT_FOLDER=$2

# Ensure output folder exists
mkdir -p "$OUTPUT_FOLDER"

# Loop from 0 to NUM_DEVICES and run the marker command in parallel
for (( i=0; i<NUM_DEVICES; i++ )); do
    DEVICE_NUM=$i
    export DEVICE_NUM
    export NUM_DEVICES
    export NUM_WORKERS
    echo "Running marker on GPU $DEVICE_NUM"
    # Invoke marker directly (no eval) with quoted paths so folders containing
    # spaces or shell metacharacters work correctly.
    CUDA_VISIBLE_DEVICES=$DEVICE_NUM marker "$INPUT_FOLDER" \
        --output_dir "$OUTPUT_FOLDER" \
        --num_chunks "$NUM_DEVICES" \
        --chunk_idx "$DEVICE_NUM" \
        --workers "$NUM_WORKERS" &

    sleep 5
done

# Wait for all background processes to finish
wait
```

--------------------------------------------------------------------------------
/marker/schema/blocks/__init__.py:
--------------------------------------------------------------------------------

```python
from __future__ import annotations

from marker.schema.blocks.base import Block, BlockId, BlockOutput
from marker.schema.blocks.caption import Caption
from marker.schema.blocks.code import Code
from marker.schema.blocks.figure import Figure
from marker.schema.blocks.footnote import Footnote
from marker.schema.blocks.form import Form
from marker.schema.blocks.equation import Equation
from marker.schema.blocks.handwriting import Handwriting
from marker.schema.blocks.inlinemath import InlineMath
from marker.schema.blocks.listitem import ListItem
from marker.schema.blocks.pagefooter import PageFooter
from marker.schema.blocks.pageheader import PageHeader
from marker.schema.blocks.picture import Picture
from marker.schema.blocks.sectionheader import SectionHeader
from marker.schema.blocks.table import Table
from marker.schema.blocks.text import Text
from marker.schema.blocks.toc import TableOfContents
from marker.schema.blocks.complexregion import ComplexRegion
from marker.schema.blocks.tablecell import TableCell
from marker.schema.blocks.reference import Reference

```

--------------------------------------------------------------------------------
/marker/schema/blocks/tablecell.py:
--------------------------------------------------------------------------------

```python
from typing import List

from marker.schema import BlockTypes
from marker.schema.blocks import Block


class TableCell(Block):
    block_type: BlockTypes = BlockTypes.TableCell
    rowspan: int
    colspan: int
    row_id: int
    col_id: int
    is_header: bool
    text_lines: List[str] | None = None
    block_description: str = "A cell in a table."

    @property
    def text(self) -> str:
        """Newline-joined cell text; empty string when no lines were detected.

        Bug fix: previously raised TypeError when ``text_lines`` was None,
        even though assemble_html tolerated that state.
        """
        return "\n".join(self.text_lines or [])

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        """Render this cell as a <td>/<th> element with row/col spans."""
        add_cell_id = block_config and block_config.get("add_block_ids", False)

        tag_cls = "th" if self.is_header else "td"
        tag = f"<{tag_cls}"
        if self.rowspan > 1:
            tag += f" rowspan={self.rowspan}"
        if self.colspan > 1:
            tag += f" colspan={self.colspan}"
        if add_cell_id:
            tag += f' data-block-id="{self.id}"'
        # Normalize text_lines in place so later callers see a list, not None.
        if self.text_lines is None:
            self.text_lines = []
        text = "<br>".join(self.text_lines)
        return f"{tag}>{text}</{tag_cls}>"

```

--------------------------------------------------------------------------------
/benchmarks/verify_scores.py:
--------------------------------------------------------------------------------

```python
import json
import argparse


def verify_scores(file_path):
    """Raise ValueError unless the mean marker heuristic score is at least 90."""
    with open(file_path, 'r') as file:
        data = json.load(file)

    heuristic_scores = [
        entry["marker"]["heuristic"]["score"] for entry in data["scores"].values()
    ]
    mean_score = sum(heuristic_scores) / len(heuristic_scores)
    if mean_score < 90:
        raise ValueError("Marker score below 90")


def verify_table_scores(file_path):
    """Raise ValueError unless the mean marker table score is at least 0.7.

    Args:
        file_path: Path to the table benchmark JSON, whose "marker" key holds
            a list of result dicts each carrying a "marker_score".
    Raises:
        ValueError: when the average marker_score falls below 0.7.
    """
    with open(file_path, 'r') as file:
        data = json.load(file)

    results = data["marker"]
    # Bug fix: divide by the number of results, not len(data) — len(data)
    # counted the top-level dict's keys and skewed the average whenever the
    # JSON contained keys besides "marker".
    avg = sum(r["marker_score"] for r in results) / len(results)
    if avg < 0.7:
        raise ValueError("Average score is below the required threshold of 0.7")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Verify benchmark scores")
    parser.add_argument("file_path", type=str, help="Path to the json file")
    parser.add_argument("--type", type=str, help="Type of file to verify", default="marker")
    args = parser.parse_args()

    # Dispatch on benchmark type; unknown types are silently ignored, as before.
    verifiers = {"marker": verify_scores, "table": verify_table_scores}
    verifier = verifiers.get(args.type)
    if verifier is not None:
        verifier(args.file_path)

```

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/breaking-bug-report.md:
--------------------------------------------------------------------------------

```markdown
---
name: Breaking bug report
about: Create a report about a breaking bug
title: "[BUG: Breaking]"
labels: 'bug: breaking'
assignees: ''

---

## 🧨 Describe the Bug

A clear and concise description of the breaking issue (e.g., crash, OOM, exception, etc).

## 📄 Input Document

Attach the PDF or input file that triggered the error.

## 📤 Output Trace / Stack Trace

Paste the **complete** stack trace or error output, if available.

<details>
<summary>Click to expand</summary>

```
Paste stack trace here
```

</details>

## ⚙️ Environment

Please fill in all relevant details:

- **Marker version**: 
- **Surya version**: 
- **Python version**: 
- **PyTorch version**: 
- **Transformers version**: 
- **Operating System** (incl. container info if relevant): 

## ✅ Expected Behavior

What did you expect Marker to do?

## 📟 Command or Code Used

Paste the **exact bash command** or **Python code** you used to run Marker:

<details>
<summary>Click to expand</summary>

```bash
# or Python code block
your_command_here --with-flags
```

</details>

## 📎 Additional Context

Any other context that might help us debug this (e.g., CLI options, working directory, runtime settings).

```

--------------------------------------------------------------------------------
/marker/models.py:
--------------------------------------------------------------------------------

```python
import os

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = (
    "1"  # Transformers uses .isin for an op, which is not supported on MPS
)

from surya.foundation import FoundationPredictor
from surya.detection import DetectionPredictor
from surya.layout import LayoutPredictor
from surya.ocr_error import OCRErrorPredictor
from surya.recognition import RecognitionPredictor
from surya.table_rec import TableRecPredictor
from surya.settings import settings as surya_settings


def create_model_dict(
    device=None, dtype=None, attention_implementation: str | None = None
) -> dict:
    """Instantiate every surya predictor marker needs, keyed by artifact name."""

    # Layout and recognition both wrap a FoundationPredictor, differing only
    # in which checkpoint they load.
    def _foundation(checkpoint):
        return FoundationPredictor(
            checkpoint=checkpoint,
            attention_implementation=attention_implementation,
            device=device,
            dtype=dtype,
        )

    return {
        "layout_model": LayoutPredictor(_foundation(surya_settings.LAYOUT_MODEL_CHECKPOINT)),
        "recognition_model": RecognitionPredictor(_foundation(surya_settings.RECOGNITION_MODEL_CHECKPOINT)),
        "table_rec_model": TableRecPredictor(device=device, dtype=dtype),
        "detection_model": DetectionPredictor(device=device, dtype=dtype),
        "ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype),
    }

```

--------------------------------------------------------------------------------
/marker/extractors/__init__.py:
--------------------------------------------------------------------------------

```python
from typing import Annotated, Sequence

from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.groups import PageGroup
from PIL import Image

from marker.services import BaseService
from marker.util import assign_config


class BaseExtractor:
    """
    An extractor that uses a provided service to extract structured data from documents.
    """

    # Upper bound on simultaneous requests subclasses may issue to the service.
    max_concurrency: Annotated[
        int,
        "The maximum number of concurrent requests to make to the Gemini model.",
    ] = 3
    disable_tqdm: Annotated[
        bool,
        "Whether to disable the tqdm progress bar.",
    ] = False

    def __init__(self, llm_service: BaseService, config=None):
        # assign_config copies matching config keys onto this instance's attributes.
        assign_config(self, config)
        self.llm_service = llm_service

    def extract_image(
        self,
        document: Document,
        page: PageGroup,
        remove_blocks: Sequence[BlockTypes] | None = None,
        highres: bool = False,  # Default False to save tokens
    ) -> Image.Image:
        """Render ``page`` to an image, optionally excluding ``remove_blocks`` types."""
        return page.get_image(
            document,
            highres=highres,
            remove_blocks=remove_blocks,
        )

    def __call__(self, document: Document, *args, **kwargs):
        # Subclasses implement the actual extraction logic.
        raise NotImplementedError
```

--------------------------------------------------------------------------------
/marker/scripts/file_to_s3.py:
--------------------------------------------------------------------------------

```python
import json
import shutil
import datetime
from pathlib import Path
import boto3

from huggingface_hub import snapshot_download

import click

S3_API_URL = "https://1afbe4656a6b40d982ab5e730a39f6b9.r2.cloudflarestorage.com"

@click.command(help="Uploads files to an S3 bucket")
@click.argument("filepath", type=str)
@click.argument("s3_path", type=str)
@click.option("--bucket_name", type=str, default="datalab")
@click.option("--access_key_id", type=str, default="<access_key_id>")
@click.option("--access_key_secret", type=str, default="<access_key_secret>")
def main(filepath: str, s3_path: str, bucket_name: str, access_key_id: str, access_key_secret: str):
    """Upload a single file to the R2/S3 bucket at ``s3_path/<filename>``."""
    filepath = Path(filepath)
    # Upload the files to S3
    s3_client = boto3.client(
        's3',
        endpoint_url=S3_API_URL,
        aws_access_key_id=access_key_id,
        aws_secret_access_key=access_key_secret,
        region_name="enam"
    )

    s3_key = f"{s3_path}/{filepath.name}"

    try:
        s3_client.upload_file(
            str(filepath),
            bucket_name,
            s3_key
        )
    except Exception as e:
        print(f"Error uploading {filepath}: {str(e)}")
    else:
        # Bug fix: only report success when the upload succeeded; previously
        # this message printed unconditionally even after a caught error.
        print(f"Uploaded files to {s3_path}")

if __name__ == "__main__":
    main()




```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/marker.py:
--------------------------------------------------------------------------------

```python
import os
import tempfile
import time

from benchmarks.overall.methods import BaseMethod, BenchmarkResult
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter


class MarkerMethod(BaseMethod):
    model_dict: dict = None
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        """Convert the sample's single-page PDF with marker and time the run."""
        pdf_bytes = sample["pdf"]  # This is a single page PDF
        config_parser = ConfigParser({
            "page_range": "0",
            "disable_tqdm": True,
            "use_llm": self.use_llm,
            "redo_inline_math": self.use_llm,
            "llm_service": "marker.services.vertex.GoogleVertexService",
            "vertex_project_id": os.getenv("VERTEX_PROJECT_ID"),
        })

        converter = PdfConverter(
            artifact_dict=self.model_dict,
            config=config_parser.generate_config_dict(),
            llm_service=config_parser.get_llm_service(),
        )

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as pdf_file:
            pdf_file.write(pdf_bytes)
            start = time.time()
            rendered = converter(pdf_file.name)
            elapsed = time.time() - start

        return {
            "markdown": rendered.markdown,
            "time": elapsed,
        }


```

--------------------------------------------------------------------------------
/marker/utils/image.py:
--------------------------------------------------------------------------------

```python
from PIL import Image
import numpy as np
import cv2
from typing import List, Optional

def is_blank_image(image: Image.Image, polygon: Optional[List[List[int]]] = None) -> bool:
    """Return True when ``image`` contains no detectable text-like content.

    Args:
        image: PIL image to test (expected RGB — it is converted with COLOR_RGB2GRAY).
        polygon: Optional 4-corner polygon; a degenerate (collapsed) polygon
            short-circuits to blank.
    """
    image = np.asarray(image)
    if (
        image is None
        or image.size == 0
        or image.shape[0] == 0
        or image.shape[1] == 0
    ):
        # Handle empty image case
        return True

    if polygon is not None:
        rounded_polys = [[int(corner[0]), int(corner[1])] for corner in polygon]
        # Collapsed corner pairs mean a zero-area region -> treat as blank.
        if rounded_polys[0] == rounded_polys[1] and rounded_polys[2] == rounded_polys[3]:
            return True

    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    gray = cv2.GaussianBlur(gray, (7, 7), 0)

    # Adaptive threshold (inverse for text as white)
    binarized = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15
    )

    num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
        binarized, connectivity=8
    )
    # Mark every non-background component in one vectorized pass. The original
    # per-label loop ran O(num_labels * pixels) to build the same mask.
    cleaned = np.where(labels > 0, 255, 0).astype(np.uint8)

    # Dilate horizontally so fragmented strokes merge into runs before testing.
    kernel = np.ones((1, 5), np.uint8)
    dilated = cv2.dilate(cleaned, kernel, iterations=3)
    # Blank iff no foreground pixel survived.
    return not bool(dilated.any())
```

--------------------------------------------------------------------------------
/marker/schema/blocks/text.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Text(Block):
    block_type: BlockTypes = BlockTypes.Text
    has_continuation: bool = False
    blockquote: bool = False
    blockquote_level: int = 0
    html: str | None = None
    block_description: str = "A paragraph or line of text."

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Render this paragraph as a <p>, wrapped in <blockquote>s when flagged."""
        if self.ignore_for_output:
            return ""

        # This happens when we used an llm processor
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        inner = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        ).replace("\n", " ")

        attrs = f" block-type='{self.block_type}'"
        if self.has_continuation:
            attrs += " class='has-continuation'"
        paragraph = f"<p{attrs}>{inner}</p>"

        if not self.blockquote:
            return paragraph
        open_tags = "<blockquote>" * self.blockquote_level
        close_tags = "</blockquote>" * self.blockquote_level
        return f"{open_tags}{paragraph}{close_tags}"
```

--------------------------------------------------------------------------------
/.github/workflows/cla.yml:
--------------------------------------------------------------------------------

```yaml
# Runs the CLA assistant bot on pull-request activity and on issue comments
# that either sign the CLA or ask for a "recheck".
name: "Marker CLA Assistant"
on:
  issue_comment:
    types: [created]
  pull_request_target:
    types: [opened,closed,synchronize]

# explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings
permissions:
  actions: write
  contents: write
  pull-requests: write
  statuses: write

jobs:
  CLAAssistant:
    runs-on: ubuntu-latest
    steps:
      - name: "Marker CLA Assistant"
        if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target'
        uses: contributor-assistant/[email protected]
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # the below token should have repo scope and must be manually added by you in the repository's secret
          # This token is required only if you have configured to store the signatures in a remote repository/organization
          PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
        with:
          path-to-signatures: 'signatures/version1/cla.json'
          path-to-document: 'https://github.com/VikParuchuri/marker/blob/master/CLA.md'
          # branch should not be protected
          branch: 'master'
          allowlist: VikParuchuri,Sandy

--------------------------------------------------------------------------------
/tests/builders/test_overriding.py:
--------------------------------------------------------------------------------

```python
import multiprocessing as mp

import pytest

from marker.providers.pdf import PdfProvider
from marker.schema import BlockTypes
from marker.schema.blocks import SectionHeader
from marker.schema.document import Document
from marker.schema.registry import register_block_class
from marker.schema.text import Line
from tests.utils import setup_pdf_provider


class NewSectionHeader(SectionHeader):
    """Marker subclass used to verify block-class overriding via override_map."""
    pass


class NewLine(Line):
    """Marker subclass used to verify Line overriding across processes."""
    pass


@pytest.mark.config({
    "page_range": [0],
    "override_map": {BlockTypes.SectionHeader: NewSectionHeader}
})
def test_overriding(pdf_document: Document):
    """The first block on page 0 should be built as the overridden class."""
    first_page = pdf_document.pages[0]
    first_block = first_page.get_block(first_page.structure[0])
    assert first_block.__class__ == NewSectionHeader


def get_lines(pdf: str, config=None):
    """Register the override classes, then return page-0 lines from a fresh provider.

    Runs inside worker processes, so registration must happen here rather
    than in the parent process.
    """
    overrides = config["override_map"]
    for btype, cls in overrides.items():
        register_block_class(btype, cls)

    provider: PdfProvider = setup_pdf_provider(pdf, config)
    return provider.get_page_lines(0)


def test_overriding_mp():
    """Block-class overrides must survive into multiprocessing workers."""
    config = {
        "page_range": [0],
        "override_map": {BlockTypes.Line: NewLine}
    }
    pdf_names = ["adversarial.pdf", "adversarial_rot.pdf"]

    with mp.Pool(processes=2) as pool:
        line_lists = pool.starmap(get_lines, [(name, config) for name in pdf_names])
        for lines in line_lists:
            assert lines[0].line.__class__ == NewLine

```

--------------------------------------------------------------------------------
/.github/workflows/scripts.yml:
--------------------------------------------------------------------------------

```yaml
name: Test CLI scripts

on: [push]

jobs:
  tests:
    runs-on: t4_gpu
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install --extras "full"
      - name: Download benchmark data
        run: |
          wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi"
          unzip -o benchmark_data.zip
      - name: Test single script
        run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0
      - name: Test convert script
        run: poetry run marker benchmark_data/pdfs --max_files 1 --page_range 0
      # Fixed step-name typo: "Text" -> "Test"
      - name: Test convert script multiple workers
        run: poetry run marker benchmark_data/pdfs --max_files 2 --page_range 0-5
      - name: Test llm option
        run: |
          poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 --use_llm > output.txt || echo "Command failed but continuing"
          if ! grep -q "UserWarning" output.txt; then
            echo "Success: No UserWarning found"
            exit 0
          else
            echo "Error: UserWarning found in output"
            exit 1
          fi
```

--------------------------------------------------------------------------------
/marker/processors/footnote.py:
--------------------------------------------------------------------------------

```python
import re

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.groups import PageGroup


class FootnoteProcessor(BaseProcessor):
    """
    A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks.
    """
    block_types = (BlockTypes.Footnote,)

    def __call__(self, document: Document):
        # Pages are processed independently; footnotes are per-page.
        for page in document.pages:
            self.push_footnotes_to_bottom(page, document)
            self.assign_superscripts(page, document)

    def push_footnotes_to_bottom(self, page: PageGroup, document: Document):
        """Move every top-level footnote block to the end of the page structure."""
        footnote_blocks = page.contained_blocks(document, self.block_types)

        # Push footnotes to the bottom
        for block in footnote_blocks:
            # Check if it is top-level
            if block.id in page.structure:
                # Move to bottom if it is
                page.structure.remove(block.id)
                page.add_structure(block)

    def assign_superscripts(self, page: PageGroup, document: Document):
        """Mark the leading footnote-marker span as superscript."""
        footnote_blocks = page.contained_blocks(document, self.block_types)

        for block in footnote_blocks:
            for span in block.contained_blocks(document, (BlockTypes.Span,)):
                # A marker is assumed to start with digits or non-word chars.
                if re.match(r"^[0-9\W]+", span.text):
                    span.has_superscript = True
                # NOTE(review): this `break` sits outside the `if`, so only the
                # FIRST span of each footnote is ever examined — confirm intent.
                break

```

--------------------------------------------------------------------------------
/marker/scripts/convert_single.py:
--------------------------------------------------------------------------------

```python
import os

os.environ["GRPC_VERBOSITY"] = "ERROR"
os.environ["GLOG_minloglevel"] = "2"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = (
    "1"  # Transformers uses .isin for a simple op, which is not supported on MPS
)

import time
import click

from marker.config.parser import ConfigParser
from marker.config.printer import CustomClickPrinter
from marker.logger import configure_logging, get_logger
from marker.models import create_model_dict
from marker.output import save_output

configure_logging()
logger = get_logger()


@click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.")
@click.argument("fpath", type=str)
@ConfigParser.common_options
def convert_single_cli(fpath: str, **kwargs):
    """CLI entry point: convert one document and save the rendered output."""
    models = create_model_dict()
    t0 = time.time()
    parser = ConfigParser(kwargs)

    # Build the converter from CLI-derived configuration.
    converter_cls = parser.get_converter_cls()
    converter = converter_cls(
        config=parser.generate_config_dict(),
        artifact_dict=models,
        processor_list=parser.get_processors(),
        renderer=parser.get_renderer(),
        llm_service=parser.get_llm_service(),
    )

    rendered = converter(fpath)
    out_folder = parser.get_output_folder(fpath)
    save_output(rendered, out_folder, parser.get_base_filename(fpath))

    logger.info(f"Saved markdown to {out_folder}")
    logger.info(f"Total time: {time.time() - t0}")

```

--------------------------------------------------------------------------------
/marker/schema/blocks/inlinemath.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class InlineMath(Block):
    block_type: BlockTypes = BlockTypes.TextInlineMath
    has_continuation: bool = False
    blockquote: bool = False
    blockquote_level: int = 0
    block_description: str = "A text block that contains inline math.  This is not used for italic text or references - only for text that contains math."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Render this block as a <p> element, optionally nested in blockquotes."""
        if self.ignore_for_output:
            return ""

        # Pre-rendered HTML (if set) takes precedence over child assembly.
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        inner = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        ).replace("\n", " ")

        attrs = f" block-type='{self.block_type}'"
        if self.has_continuation:
            attrs += " class='has-continuation'"

        paragraph = f"<p{attrs}>{inner}</p>"
        if not self.blockquote:
            return paragraph

        # One <blockquote> wrapper per indentation level.
        open_tags = "<blockquote>" * self.blockquote_level
        close_tags = "</blockquote>" * self.blockquote_level
        return f"{open_tags}{paragraph}{close_tags}"

```

--------------------------------------------------------------------------------
/tests/builders/test_layout_replace.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.builders.document import DocumentBuilder
from marker.builders.layout import LayoutBuilder
from marker.builders.line import LineBuilder
from marker.renderers.markdown import MarkdownRenderer
from marker.schema import BlockTypes
from marker.schema.registry import get_block_class


@pytest.mark.filename("thinkpython.pdf")
@pytest.mark.config({"page_range": [0]})
def test_layout_replace(
    request, config, doc_provider, layout_model, ocr_error_model, detection_model
):
    # The llm layout builder replaces blocks - this makes sure text is still merged properly
    # Build layout first; lines are merged afterwards so replaced blocks must
    # still receive their text.
    layout_builder = LayoutBuilder(layout_model, config)
    line_builder = LineBuilder(detection_model, ocr_error_model, config)
    builder = DocumentBuilder(config)
    document = builder.build_document(doc_provider)
    layout_builder(document, doc_provider)
    page = document.pages[0]
    new_blocks = []
    # Swap every Text block for a TextInlineMath block, preserving geometry
    # and structure so line merging can still target it.
    for block in page.contained_blocks(document, (BlockTypes.Text,)):
        generated_block_class = get_block_class(BlockTypes.TextInlineMath)
        generated_block = generated_block_class(
            polygon=block.polygon,
            page_id=block.page_id,
            structure=block.structure,
        )
        page.replace_block(block, generated_block)
        new_blocks.append(generated_block)
    line_builder(document, doc_provider)

    # Every replacement block must have picked up text from the line builder.
    for block in new_blocks:
        assert block.raw_text(document).strip()

    renderer = MarkdownRenderer(config)
    rendered = renderer(document)

    assert "Think Python" in rendered.markdown

```

--------------------------------------------------------------------------------
/marker/processors/code.py:
--------------------------------------------------------------------------------

```python
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Code
from marker.schema.document import Document


class CodeProcessor(BaseProcessor):
    """
    A processor for formatting code blocks.
    """
    block_types = (BlockTypes.Code, )

    def __call__(self, document: Document):
        for page in document.pages:
            for code_block in page.contained_blocks(document, self.block_types):
                self.format_block(document, code_block)

    def format_block(self, document: Document, block: Code):
        """Reconstruct indentation for a code block from line geometry.

        The leftmost x-coordinate across lines is treated as column zero;
        each line's indent is estimated as its horizontal offset divided by
        the average character width of the block.
        """
        code_lines = block.contained_blocks(document, (BlockTypes.Line,))

        left_edge = 9999  # will contain x- coord of column 0
        width_sum = 0
        char_count = 0
        for line in code_lines:
            left_edge = min(line.polygon.bbox[0], left_edge)
            width_sum += line.polygon.width
            char_count += len(line.raw_text(document))

        char_width = width_sum / max(char_count, 1)

        pieces = []
        at_line_start = False  # the very first line never gets an indent prefix
        for line in code_lines:
            text = line.raw_text(document)
            if char_width == 0:
                indent = ""
            else:
                space_count = int((line.polygon.bbox[0] - left_edge) / char_width)
                indent = " " * max(0, space_count)

            if at_line_start:
                text = indent + text

            pieces.append(text)
            at_line_start = text.endswith("\n")

        block.code = "".join(pieces).rstrip()

```

--------------------------------------------------------------------------------
/marker/settings.py:
--------------------------------------------------------------------------------

```python
from typing import Optional

from dotenv import find_dotenv
from pydantic import computed_field
from pydantic_settings import BaseSettings
import torch
import os


class Settings(BaseSettings):
    """Global marker configuration, overridable via environment variables
    and a `local.env` dotenv file (see the inner Config class)."""

    # Paths
    BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results")
    FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts")
    DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data")
    ARTIFACT_URL: str = "https://models.datalab.to/artifacts"
    FONT_NAME: str = "GoNotoCurrent-Regular.ttf"
    FONT_PATH: str = os.path.join(FONT_DIR, FONT_NAME)
    LOGLEVEL: str = "INFO"

    # General
    OUTPUT_ENCODING: str = "utf-8"
    OUTPUT_IMAGE_FORMAT: str = "JPEG"

    # LLM
    GOOGLE_API_KEY: Optional[str] = ""

    # General models
    TORCH_DEVICE: Optional[str] = (
        None  # Note: MPS device does not work for text detection, and will default to CPU
    )

    @computed_field
    @property
    def TORCH_DEVICE_MODEL(self) -> str:
        # Explicit override wins; otherwise prefer CUDA, then MPS, then CPU.
        if self.TORCH_DEVICE is not None:
            return self.TORCH_DEVICE

        if torch.cuda.is_available():
            return "cuda"

        if torch.backends.mps.is_available():
            return "mps"

        return "cpu"

    @computed_field
    @property
    def MODEL_DTYPE(self) -> torch.dtype:
        # bfloat16 only on CUDA; other devices fall back to full precision.
        if self.TORCH_DEVICE_MODEL == "cuda":
            return torch.bfloat16
        else:
            return torch.float32

    class Config:
        env_file = find_dotenv("local.env")
        extra = "ignore"


# Module-level singleton; consumers do `from marker.settings import settings`.
settings = Settings()

```

--------------------------------------------------------------------------------
/marker/converters/ocr.py:
--------------------------------------------------------------------------------

```python
from typing import Tuple

from marker.builders.document import DocumentBuilder
from marker.builders.line import LineBuilder
from marker.builders.ocr import OcrBuilder
from marker.converters.pdf import PdfConverter
from marker.processors import BaseProcessor
from marker.processors.equation import EquationProcessor
from marker.providers.registry import provider_from_filepath
from marker.renderers.ocr_json import OCRJSONRenderer


class OCRConverter(PdfConverter):
    """PdfConverter variant that always OCRs and renders OCR JSON output."""

    default_processors: Tuple[BaseProcessor, ...] = (EquationProcessor,)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if not self.config:
            self.config = {}

        # OCR output needs recognized text regardless of any embedded text layer.
        self.config["force_ocr"] = True
        self.renderer = OCRJSONRenderer

    def build_document(self, filepath: str):
        """Build a Document for the file and run all configured processors."""
        provider_cls = provider_from_filepath(filepath)
        layout_builder = self.resolve_dependencies(self.layout_builder_class)
        line_builder = self.resolve_dependencies(LineBuilder)
        ocr_builder = self.resolve_dependencies(OcrBuilder)

        provider = provider_cls(filepath, self.config)
        document = DocumentBuilder(self.config)(
            provider, layout_builder, line_builder, ocr_builder
        )

        for processor in self.processor_list:
            processor(document)

        return document

    def __call__(self, filepath: str):
        document = self.build_document(filepath)
        self.page_count = len(document.pages)
        renderer = self.resolve_dependencies(self.renderer)
        return renderer(document)

```

--------------------------------------------------------------------------------
/tests/builders/test_document_builder.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.schema import BlockTypes
from marker.schema.text.line import Line


@pytest.mark.filename("thinkpython.pdf")
@pytest.mark.config({"page_range": [0]})
def test_document_builder(pdf_document):
    """Native-text PDF: check page -> section header -> line -> span hierarchy."""
    page = pdf_document.pages[0]
    assert page.structure[0] == "/page/0/SectionHeader/0"

    header = page.get_block(page.structure[0])
    assert header.block_type == BlockTypes.SectionHeader
    assert header.text_extraction_method == "pdftext"

    line: Line = page.get_block(header.structure[0])
    assert line.block_type == BlockTypes.Line

    span = page.get_block(line.structure[0])
    assert span.block_type == BlockTypes.Span
    assert span.text == "Think Python"
    assert span.font == "URWPalladioL-Roma"
    assert span.formats == ["plain"]


@pytest.mark.config({"page_range": [0]})
def test_document_builder_inline_eq(pdf_document):
    """Default fixture document: header text should come from surya OCR."""
    page = pdf_document.pages[0]
    assert page.structure[0] == "/page/0/SectionHeader/0"

    header = page.get_block(page.structure[0])
    assert header.block_type == BlockTypes.SectionHeader
    assert header.text_extraction_method == "surya"

    line: Line = page.get_block(header.structure[0])
    assert line.block_type == BlockTypes.Line

    span = page.get_block(line.structure[0])
    assert span.block_type == BlockTypes.Span
    assert span.text.strip() == "Subspace Adversarial Training"
    assert "bold" in span.formats

```

--------------------------------------------------------------------------------
/marker/services/__init__.py:
--------------------------------------------------------------------------------

```python
from typing import Optional, List, Annotated
from io import BytesIO

import PIL
from pydantic import BaseModel

from marker.schema.blocks import Block
from marker.util import assign_config, verify_config_keys
import base64


class BaseService:
    """Base class for LLM services: config validation plus image helpers.

    Subclasses implement `process_images` and `__call__` for a concrete API.
    """

    timeout: Annotated[int, "The timeout to use for the service."] = 30
    max_retries: Annotated[
        int, "The maximum number of retries to use for the service."
    ] = 2
    retry_wait_time: Annotated[int, "The wait time between retries."] = 3
    # Fixed: the default is None, so the annotated type must be Optional[int]
    # (was plain int, which mistyped the attribute).
    max_output_tokens: Annotated[
        Optional[int], "The maximum number of output tokens to generate."
    ] = None

    def __init__(self, config: Optional[BaseModel | dict] = None):
        assign_config(self, config)

        # Ensure we have all necessary fields filled out (API keys, etc.)
        verify_config_keys(self)

    def img_to_base64(self, img: PIL.Image.Image, format: str = "WEBP"):
        """Encode a PIL image as a base64 string in the given format."""
        image_bytes = BytesIO()
        img.save(image_bytes, format=format)
        return base64.b64encode(image_bytes.getvalue()).decode("utf-8")

    def process_images(self, images: List[PIL.Image.Image]) -> list:
        """Convert images into provider-specific message parts (subclass hook)."""
        raise NotImplementedError

    def format_image_for_llm(self, image):
        """Normalize a single image or image list into LLM message parts.

        Returns an empty list for falsy input (no image supplied).
        """
        if not image:
            return []

        if not isinstance(image, list):
            image = [image]

        return self.process_images(image)

    def __call__(
        self,
        prompt: str,
        image: PIL.Image.Image | List[PIL.Image.Image] | None,
        block: Block | None,
        response_schema: type[BaseModel],
        max_retries: int | None = None,
        timeout: int | None = None,
    ):
        raise NotImplementedError

```

--------------------------------------------------------------------------------
/tests/providers/test_document_providers.py:
--------------------------------------------------------------------------------

```python
import pytest


@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("lambda.pptx")
def test_pptx_provider(doc_provider):
    """PPTX provider: landscape page size and first two text lines of slide 0."""
    assert doc_provider.get_images([0], 72)[0].size == (842, 596)

    page_lines = doc_provider.get_page_lines(0)

    spans = page_lines[0].spans
    assert spans[0].text == "Lambda Calculus"

    spans = page_lines[1].spans
    assert spans[0].text == "CSE 340 – Principles of Programming Languages"


@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("manual.epub")
def test_epub_provider(doc_provider):
    """EPUB provider: portrait page size and first text line of page 0."""
    assert doc_provider.get_images([0], 72)[0].size == (596, 842)

    page_lines = doc_provider.get_page_lines(0)

    spans = page_lines[0].spans
    assert spans[0].text == "The Project Gutenberg eBook of Simple"


@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("china.html")
def test_html_provider(doc_provider):
    """HTML provider: portrait page size and first text line of page 0."""
    assert doc_provider.get_images([0], 72)[0].size == (596, 842)

    page_lines = doc_provider.get_page_lines(0)

    spans = page_lines[0].spans
    assert spans[0].text == "Jump to content"

@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("gatsby.docx")
def test_docx_provider(doc_provider):
    """DOCX provider: portrait page size and first text line of page 0."""
    assert doc_provider.get_images([0], 72)[0].size == (596, 842)

    page_lines = doc_provider.get_page_lines(0)

    spans = page_lines[0].spans
    assert spans[0].text == "Themes"


@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("single_sheet.xlsx")
def test_xlsx_provider(doc_provider):
    """XLSX provider: landscape page size; first line is the sheet name."""
    assert doc_provider.get_images([0], 72)[0].size == (842, 596)

    page_lines = doc_provider.get_page_lines(0)

    spans = page_lines[0].spans
    assert spans[0].text == "Sheet1"
```

--------------------------------------------------------------------------------
/marker/schema/blocks/listitem.py:
--------------------------------------------------------------------------------

```python
import re

from marker.schema import BlockTypes
from marker.schema.blocks import Block


def replace_bullets(child_blocks):
    """Strip the leading bullet glyph from the deepest first-descendant line.

    Descends the first-child chain to the bottom; if that block is a Line,
    its HTML has any leading bullet character removed in place.
    """
    deepest = None
    while child_blocks:
        deepest = child_blocks[0]
        child_blocks = deepest.children

    if deepest is not None and deepest.id.block_type == BlockTypes.Line:
        bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○ഠ ം◦■▪▫–—-]( )"
        deepest.html = re.sub(bullet_pattern, r"\1\2", deepest.html)


class ListItem(Block):
    block_type: BlockTypes = BlockTypes.ListItem
    list_indent_level: int = 0
    block_description: str = "A list item that is part of a list.  This block is used to represent a single item in a list."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        """Render as an <li>, wrapped in its own <ul> when the item is indented."""
        template = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        ).replace("\n", " ")
        # Remove the first bullet character
        replace_bullets(child_blocks)

        if self.html:
            # Pre-rendered HTML wins; strip any <li> wrapper it already carries.
            rendered = super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )
            template = rendered.strip().replace("<li>", "").replace("</li>", "")

        el_attr = f" block-type='{self.block_type}'"
        if self.list_indent_level:
            return f"<ul><li{el_attr} class='list-indent-{self.list_indent_level}'>{template}</li></ul>"
        return f"<li{el_attr}>{template}</li>"

```

--------------------------------------------------------------------------------
/marker/providers/image.py:
--------------------------------------------------------------------------------

```python
from typing import List, Annotated
from PIL import Image

from marker.providers import ProviderPageLines, BaseProvider
from marker.schema.polygon import PolygonBox
from marker.schema.text import Line
from pdftext.schema import Reference


class ImageProvider(BaseProvider):
    """Provider that serves a single image file as a one-page document."""

    page_range: Annotated[
        List[int],
        "The range of pages to process.",
        "Default is None, which will process all pages.",
    ] = None

    # An image file always yields exactly one page.
    image_count: int = 1

    def __init__(self, filepath: str, config=None):
        super().__init__(filepath, config)

        self.images = [Image.open(filepath)]
        self.page_lines: ProviderPageLines = {i: [] for i in range(self.image_count)}

        if self.page_range is None:
            self.page_range = range(self.image_count)

        # Fixed: the failure message previously referenced `self.doc`, which
        # does not exist on this provider — a bad page range raised
        # AttributeError instead of the intended message.
        assert max(self.page_range) < self.image_count and min(self.page_range) >= 0, (
            f"Invalid page range, values must be between 0 and {self.image_count - 1}.  Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."
        )

        self.page_bboxes = {
            i: [0, 0, self.images[i].size[0], self.images[i].size[1]]
            for i in self.page_range
        }

    def __len__(self):
        return self.image_count

    def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
        # dpi is ignored: the source image has a fixed native resolution.
        return [self.images[i] for i in idxs]

    def get_page_bbox(self, idx: int) -> PolygonBox | None:
        bbox = self.page_bboxes[idx]
        if bbox:
            return PolygonBox.from_bbox(bbox)

    def get_page_lines(self, idx: int) -> List[Line]:
        return self.page_lines[idx]

    def get_page_refs(self, idx: int) -> List[Reference]:
        # Plain images carry no embedded references/links.
        return []

```

--------------------------------------------------------------------------------
/tests/builders/test_ocr_pipeline.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.schema import BlockTypes
from marker.schema.text.line import Line


def _ocr_pipeline_test(pdf_document):
    """Shared assertions for OCR-forced pipelines: structure, text, and bbox scale."""
    first_page = pdf_document.pages[0]
    assert first_page.structure[0] == "/page/0/SectionHeader/0"

    first_block = first_page.get_block(first_page.structure[0])
    # force_ocr means text must come from surya, not the PDF text layer.
    assert first_block.text_extraction_method == "surya"
    assert first_block.block_type == BlockTypes.SectionHeader

    first_text_block: Line = first_page.get_block(first_block.structure[0])
    assert first_text_block.block_type == BlockTypes.Line

    first_span = first_page.get_block(first_text_block.structure[0])
    assert first_span.block_type == BlockTypes.Span
    assert first_span.text.strip() == "Subspace Adversarial Training"

    # Ensure we match all text lines up properly
    # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes
    text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
    text_blocks = first_page.contained_blocks(
        pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
    )
    # assert len(text_lines) == 83

    # Ensure the bbox sizes match up
    max_line_position = max([line.polygon.y_end for line in text_lines])
    max_block_position = max(
        [block.polygon.y_end for block in text_blocks if block.source == "layout"]
    )
    # 2% tolerance for rounding/scaling differences between OCR and layout.
    assert max_line_position <= (max_block_position * 1.02)


@pytest.mark.config({"force_ocr": True, "page_range": [0]})
def test_ocr_pipeline(pdf_document):
    """Forced OCR without LLM assistance."""
    _ocr_pipeline_test(pdf_document)


@pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True})
def test_ocr_with_inline_pipeline(pdf_document):
    """Forced OCR with the LLM-assisted path enabled."""
    _ocr_pipeline_test(pdf_document)

```

--------------------------------------------------------------------------------
/benchmarks/table/gemini.py:
--------------------------------------------------------------------------------

```python
import json
from PIL import Image
from google import genai
from google.genai import types
from io import BytesIO
from pydantic import BaseModel

from marker.settings import settings

prompt = """
You're an expert document analyst who is good at turning tables in documents into HTML.  Analyze the provided image, and convert it to a faithful HTML representation.
 
Guidelines:
- Keep the HTML simple and concise.
- Only include the <table> tag and contents.
- Only use <table>, <tr>, and <td> tags.  Only use the colspan and rowspan attributes if necessary.  Do not use <tbody>, <thead>, or <th> tags.
- Make sure the table is as faithful to the image as possible with the given tags.

**Instructions**
1. Analyze the image, and determine the table structure.
2. Convert the table image to HTML, following the guidelines above.
3. Output only the HTML for the table, starting with the <table> tag and ending with the </table> tag.
""".strip()

class TableSchema(BaseModel):
    """Structured response schema for Gemini's JSON table output."""
    table_html: str

def gemini_table_rec(image: Image.Image):
    """Send a table image to Gemini and return the transcribed HTML string."""
    client = genai.Client(
        api_key=settings.GOOGLE_API_KEY,
        http_options={"timeout": 60000}
    )

    png_buffer = BytesIO()
    image.save(png_buffer, format="PNG")

    # According to gemini docs, it performs better if the image is the first element
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[
            types.Part.from_bytes(data=png_buffer.getvalue(), mime_type="image/png"),
            prompt,
        ],
        config={
            "temperature": 0,
            "response_schema": TableSchema,
            "response_mime_type": "application/json",
        },
    )

    raw_json = response.candidates[0].content.parts[0].text
    return json.loads(raw_json)["table_html"]
```

--------------------------------------------------------------------------------
/tests/builders/test_pdf_links.py:
--------------------------------------------------------------------------------

```python
import re

import pytest

from marker.converters.pdf import PdfConverter
from marker.renderers.markdown import MarkdownOutput
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.util import classes_to_strings


@pytest.mark.filename("arxiv_test.pdf")
@pytest.mark.output_format("markdown")
@pytest.mark.config({"disable_ocr": True})
def test_pdf_links(pdf_document: Document, config, renderer, model_dict, temp_doc):
    """Internal PDF links become markdown anchors and matching span ids."""
    first_page = pdf_document.pages[1]

    processors = ["marker.processors.reference.ReferenceProcessor"]
    pdf_converter = PdfConverter(
        artifact_dict=model_dict,
        processor_list=processors,
        renderer=classes_to_strings([renderer])[0],
        config=config,
    )

    # The "II." span should link to the in-document anchor for its section.
    for section_header_span in first_page.contained_blocks(
        pdf_document, (BlockTypes.Span,)
    ):
        if "II." in section_header_span.text:
            assert section_header_span.url == "#page-1-0"
            break
    else:
        raise ValueError("Could not find II. in the first page")

    section_header_block = first_page.contained_blocks(
        pdf_document, (BlockTypes.SectionHeader,)
    )[0]
    assert section_header_block.raw_text(pdf_document) == "II. THEORETICAL FRAMEWORK\n"

    assert first_page.refs[0].ref == "page-1-0"

    markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
    markdown = markdown_output.markdown

    assert "[II.](#page-1-0)" in markdown
    assert '<span id="page-1-0"></span>II. THEORETICAL FRAMEWORK' in markdown

    # Every link target referenced in the markdown must have a span anchor.
    for ref in set(
        [
            f'<span id="page-{m[0]}-{m[1]}">'
            for m in re.findall(r"\]\(#page-(\d+)-(\d+)\)", markdown)
        ]
    ):
        assert ref in markdown, f"Reference {ref} not found in markdown"
```

--------------------------------------------------------------------------------
/benchmarks/overall/download/llamaparse.py:
--------------------------------------------------------------------------------

```python
import io
import time

import requests

from benchmarks.overall.download.base import Downloader


class LlamaParseDownloader(Downloader):
    """Downloader that converts PDFs to markdown via the LlamaParse API."""

    service = "llamaparse"

    def get_html(self, pdf_bytes):
        """Parse raw PDF bytes; return the markdown and elapsed seconds."""
        upload_name = str(time.time()) + ".pdf"
        started = time.time()
        buff = io.BytesIO(pdf_bytes)
        md = upload_and_parse_file(self.api_key, upload_name, buff)
        elapsed = time.time() - started
        if isinstance(md, bytes):
            md = md.decode("utf-8")

        return {
            "md": md,
            "time": elapsed,
        }


def upload_and_parse_file(api_key: str, fname: str, buff, max_retries: int = 180, delay: int = 1):
    """Upload a PDF to LlamaParse and poll until the markdown result is ready.

    Args:
        api_key: LlamaParse API key (used as a Bearer token).
        fname: Filename to report for the upload.
        buff: File-like object containing the PDF bytes.
        max_retries: Maximum number of status polls before giving up.
        delay: Seconds to sleep between polls.

    Returns:
        The parsed markdown string.

    Raises:
        ValueError: If the parsing job reports a terminal failure status.
        TimeoutError: If the job does not finish within max_retries polls.
        requests.HTTPError: On any failed HTTP request.
    """
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json"
    }

    # Upload file
    files = {
        'file': (fname, buff, 'application/pdf')
    }
    response = requests.post(
        'https://api.cloud.llamaindex.ai/api/v1/parsing/upload',
        headers=headers,
        files=files
    )
    response.raise_for_status()
    job_id = response.json()['id']

    # Poll for completion
    for _ in range(max_retries):
        status_response = requests.get(
            f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}',
            headers=headers
        )
        status_response.raise_for_status()
        status = status_response.json()['status']
        if status == 'SUCCESS':
            # Get results
            result_response = requests.get(
                f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}/result/markdown',
                headers=headers
            )
            result_response.raise_for_status()
            return result_response.json()['markdown']
        if status in ('ERROR', 'CANCELED'):
            # Fixed: a failed job previously kept polling until TimeoutError;
            # fail fast when the API reports a terminal error state.
            raise ValueError(f"Parsing job {job_id} failed with status {status}")

        time.sleep(delay)

    raise TimeoutError("Job did not complete within the maximum retry attempts")
```

--------------------------------------------------------------------------------
/benchmarks/overall/download/mistral.py:
--------------------------------------------------------------------------------

```python
import io
import time
import requests

from benchmarks.overall.download.base import Downloader


class MistralDownloader(Downloader):
    """Downloader that converts benchmark PDFs to markdown via Mistral OCR."""

    service = "mistral"

    def get_html(self, pdf_bytes):
        """OCR one PDF through Mistral; return its markdown and wall time."""
        upload_name = f"{time.time()}.pdf"  # unique-enough name per request
        started = time.time()
        markdown = upload_and_process_file(self.api_key, upload_name, io.BytesIO(pdf_bytes))
        elapsed = time.time() - started

        if isinstance(markdown, bytes):
            markdown = markdown.decode("utf-8")

        return {
            "md": markdown,
            "time": elapsed,
        }


def upload_and_process_file(api_key: str, fname: str, buff):
    """Run a PDF through Mistral OCR: upload, obtain a signed URL, then OCR it.

    Args:
        api_key: Mistral API key (sent as a Bearer token).
        fname: Filename to attach to the multipart upload.
        buff: File-like object containing the PDF bytes.

    Returns:
        Markdown for the first page of the OCR result (the benchmark PDFs
        are single-page, per Downloader.generate_data).
    """
    headers = {
        "Authorization": f"Bearer {api_key}"
    }

    # Step 1: upload the file with purpose=ocr.
    upload_headers = headers.copy()
    files = {
        'file': (fname, buff, 'application/pdf'),
        'purpose': (None, 'ocr')
    }

    upload_response = requests.post(
        'https://api.mistral.ai/v1/files',
        headers=upload_headers,
        files=files
    )
    upload_response.raise_for_status()
    file_id = upload_response.json()['id']

    # Step 2: request a signed URL (valid 24 hours) for the uploaded file.
    url_headers = headers.copy()
    url_headers["Accept"] = "application/json"

    url_response = requests.get(
        f'https://api.mistral.ai/v1/files/{file_id}/url?expiry=24',
        headers=url_headers
    )
    url_response.raise_for_status()
    signed_url = url_response.json()['url']

    # Step 3: run the OCR model against the signed URL.
    ocr_headers = headers.copy()
    ocr_headers["Content-Type"] = "application/json"

    ocr_data = {
        "model": "mistral-ocr-latest",
        "document": {
            "type": "document_url",
            "document_url": signed_url
        },
        "include_image_base64": True
    }
    ocr_response = requests.post(
        'https://api.mistral.ai/v1/ocr',
        headers=ocr_headers,
        json=ocr_data
    )
    ocr_response.raise_for_status()
    result = ocr_response.json()
    # Only the first page is returned; inputs are single-page PDFs.
    return result["pages"][0]["markdown"]
```

--------------------------------------------------------------------------------
/tests/config/test_config.py:
--------------------------------------------------------------------------------

```python
import sys
from contextlib import suppress
import click

from marker.config.printer import CustomClickPrinter
from marker.config.crawler import crawler
from marker.config.parser import ConfigParser


def capture_kwargs(argv):
    """Run ConfigParser's CLI options through click and return the parsed kwargs.

    sys.argv is swapped out for the duration of the click invocation and
    restored afterwards; SystemExit from click is suppressed.
    """
    captured = {}

    def _record(**kwargs):
        captured.update(kwargs)
        return kwargs

    decorator = click.command(cls=CustomClickPrinter)
    saved_argv = sys.argv
    sys.argv = argv
    try:
        with suppress(SystemExit):
            decorator(ConfigParser.common_options(_record))()
    finally:
        sys.argv = saved_argv

    return captured


def test_config_parser():
    """CLI flags should be captured and propagated into the config dict."""
    argv = [
        "test",
        "--disable_multiprocessing",
        "--output_dir",
        "output_dir",
        "--height_tolerance",
        "0.5",
    ]
    sys.argv = argv
    kwargs = capture_kwargs(argv)
    config_dict = ConfigParser(kwargs).generate_config_dict()

    # Flags are captured verbatim
    assert kwargs["disable_multiprocessing"]
    assert kwargs["output_dir"] == "output_dir"

    # Derived settings
    assert config_dict["pdftext_workers"] == 1  # disabling multiprocessing does this
    assert config_dict["height_tolerance"] == 0.5


def test_config_none():
    """With no CLI arguments, every crawled option should come back as None."""
    kwargs = capture_kwargs(["test"])

    # We force some options to become flags for ease of use on the CLI
    for attr in crawler.attr_set:
        assert kwargs.get(attr) is None


def test_config_llm():
    """--use_llm should flow through to the generated config dict."""
    captured = capture_kwargs(["test", "--use_llm"])
    config_dict = ConfigParser(captured).generate_config_dict()
    assert config_dict["use_llm"]


def test_config_force_ocr():
    """--force_ocr should flow through to the generated config dict."""
    captured = capture_kwargs(["test", "--force_ocr"])
    config_dict = ConfigParser(captured).generate_config_dict()
    assert config_dict["force_ocr"]

```

--------------------------------------------------------------------------------
/tests/converters/test_ocr_converter.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.converters.ocr import OCRConverter
from marker.renderers.ocr_json import OCRJSONOutput, OCRJSONPageOutput


def _ocr_converter(config, model_dict, temp_pdf, line_count: int, eq_count: int):
    """Run the OCR converter on temp_pdf and sanity-check the single-page output.

    NOTE: line_count is currently unused — the per-line count check is disabled.
    """
    converter = OCRConverter(artifact_dict=model_dict, config=config)
    result: OCRJSONOutput = converter(temp_pdf.name)
    pages = result.children

    assert len(pages) == 1
    # assert len(pages[0].children) == line_count
    equations = [blk for blk in pages[0].children if blk.block_type == "Equation"]
    assert len(equations) == eq_count
    return pages


def check_bboxes(page: OCRJSONPageOutput, lines):
    """Assert every child bbox of every line sits inside the page's bbox."""
    page_box = page.bbox
    for line in lines:
        assert len(line.children) > 0
        for child in line.children:
            cb = child.bbox
            inside = (
                cb[0] >= page_box[0]
                and cb[1] >= page_box[1]
                and cb[2] <= page_box[2]
                and cb[3] <= page_box[3]
            )
            assert inside, "Child bbox is outside page bbox"


@pytest.mark.config({"page_range": [0]})
def test_ocr_converter(config, model_dict, temp_doc):
    # Page 0 should contain exactly 2 equation blocks (line count currently unchecked).
    _ocr_converter(config, model_dict, temp_doc, 85, 2)


@pytest.mark.filename("pres.pdf")
@pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True})
def test_ocr_converter_force(config, model_dict, temp_doc):
    # With force_ocr, every line's child bboxes must still fall inside the page bbox.
    pages = _ocr_converter(config, model_dict, temp_doc, 10, 0)
    lines = [line for line in pages[0].children if line.block_type == "Line"]
    check_bboxes(pages[0], lines)


@pytest.mark.filename("pres.pdf")
@pytest.mark.config({"page_range": [1], "keep_chars": True})
def test_ocr_converter_keep(config, model_dict, temp_doc):
    # Same bbox containment check as the force_ocr variant, but without forcing OCR.
    pages = _ocr_converter(config, model_dict, temp_doc, 10, 0)
    lines = [line for line in pages[0].children if line.block_type == "Line"]
    check_bboxes(pages[0], lines)

```

--------------------------------------------------------------------------------
/benchmarks/overall/display/dataset.py:
--------------------------------------------------------------------------------

```python
import json
from typing import List

import datasets
from tqdm import tqdm

from benchmarks.overall.registry import METHOD_REGISTRY
from benchmarks.overall.schema import FullResult


def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_types: List[str], max_rows: int | None = None) -> datasets.Dataset:
    """Assemble an HF dataset joining benchmark samples with per-method results.

    For each benchmark sample that has markdown in `result`, the row carries the
    sample metadata plus, per method: the markdown, a rendered image, and one
    score (plus a JSON detail string) per requested score type.

    Args:
        bench_dataset: The source benchmark dataset.
        result: Aggregated markdown and scores keyed by sample index and method.
        score_types: Score categories to copy into the row (-1.0 when missing).
        max_rows: Optional cap on the number of benchmark rows processed.

    Returns:
        A datasets.Dataset built from the assembled rows.
    """
    # Fix: PIL was referenced below but never imported in this module, so the
    # render-failure fallback raised NameError instead of producing a blank image.
    import PIL.Image

    rows = []
    for idx, sample in tqdm(enumerate(bench_dataset), desc="Building dataset"):
        if idx not in result["markdown"]:
            continue

        if max_rows is not None and idx >= max_rows:
            break

        row = {
            "uuid": sample["uuid"],
            "classification": sample["classification"],
            "language": sample["language"],
            "img": sample["img"],
        }
        for method in result["markdown"][idx]:
            if method == "gt":
                continue

            method_cls = METHOD_REGISTRY[method]()
            md = result["markdown"][idx][method]
            try:
                method_img = method_cls.render(md)
            except Exception:
                # This can happen when the markdown is None; fall back to a blank image.
                method_img = PIL.Image.new("RGB", (200, 200))

            row[f"{method}_md"] = md
            row[f"{method}_img"] = method_img

            for score_type in score_types:
                try:
                    row[f"{method}_{score_type}"] = result["scores"][idx][method][score_type]["score"]
                except KeyError:
                    row[f"{method}_{score_type}"] = -1.0  # Missing score
                try:
                    row[f"{method}_{score_type}_detail"] = json.dumps(result["scores"][idx][method][score_type]["specific_scores"])
                except KeyError:
                    row[f"{method}_{score_type}_detail"] = ""  # Missing detail
        rows.append(row)
    ds = datasets.Dataset.from_list(rows)
    return ds


```

--------------------------------------------------------------------------------
/benchmarks/overall/download/base.py:
--------------------------------------------------------------------------------

```python
import json
from json import JSONDecodeError
from pathlib import Path

import datasets
from tqdm import tqdm


class Downloader:
    """Base class for benchmark downloaders that run PDFs through external services.

    Subclasses set `service` and implement `get_html`. Results are cached as
    one JSON file per sample, then pushed to the HF hub as a private dataset.
    """
    # Local directory where per-sample JSON results are cached.
    cache_path: Path = Path("cache")
    # Service name, used in the uploaded dataset repo id; set by subclasses.
    service: str

    def __init__(self, api_key, app_id, max_rows: int = 2200):
        self.cache_path.mkdir(exist_ok=True)
        self.max_rows = max_rows
        self.api_key = api_key
        # NOTE(review): app_id appears to be used only by some services
        # (e.g. mathpix) — confirm against subclasses.
        self.app_id = app_id
        self.ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train")

    def get_html(self, pdf_bytes):
        """Convert one PDF; must return a dict with at least "md" and "time" keys."""
        raise NotImplementedError

    def upload_ds(self):
        """Collect all cached JSON results and push them to the hub (private)."""
        rows = []
        for file in self.cache_path.glob("*.json"):
            with open(file, "r") as f:
                data = json.load(f)
            rows.append(data)

        out_ds = datasets.Dataset.from_list(rows, features=datasets.Features({
            "md": datasets.Value("string"),
            "uuid": datasets.Value("string"),
            "time": datasets.Value("float"),
        }))
        out_ds.push_to_hub(f"datalab-to/marker_benchmark_{self.service}", private=True)

    def generate_data(self):
        """Run get_html over each benchmark sample, caching results to disk.

        Errors are printed and skipped so one bad sample doesn't abort the run.
        (Both except clauses below behave identically; JSONDecodeError is kept
        as an explicit, documented failure mode.)
        """
        max_rows = self.max_rows
        for idx, sample in tqdm(enumerate(self.ds), desc=f"Saving {self.service} results"):
            cache_file = self.cache_path / f"{idx}.json"
            # Skip samples already converted on a previous run.
            if cache_file.exists():
                continue

            pdf_bytes = sample["pdf"]  # This is a single page PDF
            try:
                out_data = self.get_html(pdf_bytes)
            except JSONDecodeError as e:
                print(f"Error with sample {idx}: {e}")
                continue
            except Exception as e:
                print(f"Error with sample {idx}: {e}")
                continue
            out_data["uuid"] = sample["uuid"]

            with cache_file.open("w") as f:
                json.dump(out_data, f)

            if idx >= max_rows:
                break

    def __call__(self):
        # Generate (or resume) the cached results, then upload everything.
        self.generate_data()
        self.upload_ds()

```

--------------------------------------------------------------------------------
/marker/processors/reference.py:
--------------------------------------------------------------------------------

```python
import numpy as np

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Reference
from marker.schema.document import Document
from marker.schema.groups.list import ListGroup
from marker.schema.groups.table import TableGroup
from marker.schema.registry import get_block_class
from marker.schema.groups.figure import FigureGroup


class ReferenceProcessor(BaseProcessor):
    """
    A processor for adding references to the document.
    """

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        """Attach each page ref to the nearest visible block's structure."""
        # Resolve the concrete Reference block class from the registry.
        ReferenceClass: Reference = get_block_class(BlockTypes.Reference)

        for page in document.pages:
            refs = page.refs
            # Anchor coordinate of every ref on this page.
            ref_starts = np.array([ref.coord for ref in refs])

            # Flatten one level of grouping so refs can attach to the
            # individual children of list/figure/table groups.
            blocks = []
            for block_id in page.structure:
                block = page.get_block(block_id)
                if isinstance(block, (ListGroup, FigureGroup, TableGroup)):
                    blocks.extend([page.get_block(b) for b in block.structure])
                else:
                    blocks.append(block)
            blocks = [b for b in blocks if not b.ignore_for_output]

            # Top-left corner of each candidate block.
            block_starts = np.array([block.polygon.bbox[:2] for block in blocks])

            # Nothing to match if either side is empty.
            if not (len(refs) and len(block_starts)):
                continue

            # Pairwise euclidean distances: rows = blocks, cols = refs.
            distances = np.linalg.norm(block_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2)
            for ref_idx in range(len(ref_starts)):
                # Closest block to this ref's anchor point.
                block_idx = np.argmin(distances[:, ref_idx])
                block = blocks[block_idx]

                # Insert the reference block at the front of that block's structure.
                ref_block = page.add_full_block(ReferenceClass(
                    ref=refs[ref_idx].ref,
                    polygon=block.polygon,
                    page_id=page.page_id
                ))
                if block.structure is None:
                    block.structure = []
                block.structure.insert(0, ref_block.id)

```

--------------------------------------------------------------------------------
/marker/builders/document.py:
--------------------------------------------------------------------------------

```python
from typing import Annotated

from marker.builders import BaseBuilder
from marker.builders.layout import LayoutBuilder
from marker.builders.line import LineBuilder
from marker.builders.ocr import OcrBuilder
from marker.providers.pdf import PdfProvider
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup
from marker.schema.registry import get_block_class


class DocumentBuilder(BaseBuilder):
    """
    Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder.
    """
    lowres_image_dpi: Annotated[
        int,
        "DPI setting for low-resolution page images used for Layout and Line Detection.",
    ] = 96
    highres_image_dpi: Annotated[
        int,
        "DPI setting for high-resolution page images used for OCR.",
    ] = 192
    disable_ocr: Annotated[
        bool,
        "Disable OCR processing.",
    ] = False

    def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, line_builder: LineBuilder, ocr_builder: OcrBuilder):
        """Build the document, then run layout, line, and (optionally) OCR passes in order."""
        document = self.build_document(provider)
        layout_builder(document, provider)
        line_builder(document, provider)
        if not self.disable_ocr:
            ocr_builder(document, provider)
        return document

    def build_document(self, provider: PdfProvider):
        """Create the initial Document with one PageGroup per page in the provider's range.

        Each page carries both a low-res image (layout/line detection) and a
        high-res image (OCR), plus its bbox polygon and refs.
        """
        PageGroupClass: PageGroup = get_block_class(BlockTypes.Page)
        lowres_images = provider.get_images(provider.page_range, self.lowres_image_dpi)
        highres_images = provider.get_images(provider.page_range, self.highres_image_dpi)
        initial_pages = [
            PageGroupClass(
                page_id=p,
                lowres_image=lowres_images[i],
                highres_image=highres_images[i],
                polygon=provider.get_page_bbox(p),
                refs=provider.get_page_refs(p)
            ) for i, p in enumerate(provider.page_range)
        ]
        DocumentClass: Document = get_block_class(BlockTypes.Document)
        return DocumentClass(filepath=provider.filepath, pages=initial_pages)

```

--------------------------------------------------------------------------------
/marker/services/ollama.py:
--------------------------------------------------------------------------------

```python
import json
from typing import Annotated, List

import PIL
import requests
from marker.logger import get_logger
from pydantic import BaseModel

from marker.schema.blocks import Block
from marker.services import BaseService

logger = get_logger()


class OllamaService(BaseService):
    """LLM service backed by a local Ollama server's /api/generate endpoint."""

    ollama_base_url: Annotated[
        str, "The base url to use for ollama.  No trailing slash."
    ] = "http://localhost:11434"
    ollama_model: Annotated[str, "The model name to use for ollama."] = (
        "llama3.2-vision"
    )

    def process_images(self, images):
        """Encode images as base64 strings for the Ollama API."""
        image_bytes = [self.img_to_base64(img) for img in images]
        return image_bytes

    def __call__(
        self,
        prompt: str,
        image: PIL.Image.Image | List[PIL.Image.Image] | None,
        block: Block | None,
        response_schema: type[BaseModel],
        max_retries: int | None = None,
        timeout: int | None = None,
    ):
        """Run a structured-output generation request against Ollama.

        Returns the parsed JSON response dict, or {} on any failure.
        NOTE(review): max_retries is not honored here — exactly one request is
        made; confirm whether retry handling belongs in BaseService.
        """
        url = f"{self.ollama_base_url}/api/generate"
        headers = {"Content-Type": "application/json"}

        # Constrain generation to the response schema's object shape.
        schema = response_schema.model_json_schema()
        format_schema = {
            "type": "object",
            "properties": schema["properties"],
            "required": schema["required"],
        }

        image_bytes = self.format_image_for_llm(image)

        payload = {
            "model": self.ollama_model,
            "prompt": prompt,
            "stream": False,
            "format": format_schema,
            "images": image_bytes,
        }

        try:
            # Fix: the timeout parameter was accepted but never passed through,
            # so a hung server blocked forever. timeout=None keeps old behavior.
            response = requests.post(url, json=payload, headers=headers, timeout=timeout)
            response.raise_for_status()
            response_data = response.json()

            total_tokens = (
                response_data["prompt_eval_count"] + response_data["eval_count"]
            )

            if block:
                block.update_metadata(llm_request_count=1, llm_tokens_used=total_tokens)

            data = response_data["response"]
            return json.loads(data)
        except Exception as e:
            # Best-effort service: log the failure and fall through to {}.
            logger.warning(f"Ollama inference failed: {e}")

        return {}

```

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------

```toml
[tool.poetry]
name = "marker-pdf"
version = "1.10.1"
description = "Convert documents to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
license = "GPL-3.0-or-later"
repository = "https://github.com/VikParuchuri/marker"
keywords = ["pdf", "markdown", "ocr", "nlp"]
packages = [
    {include = "marker"}
]
include = [
    "marker/scripts/*.sh",
    "marker/scripts/*.html",
]

[tool.poetry.dependencies]
python = "^3.10"
Pillow = "^10.1.0"
pydantic = "^2.4.2"
pydantic-settings = "^2.0.3"
transformers = "^4.45.2"
python-dotenv = "^1.0.0"
torch = "^2.7.0"
tqdm = "^4.66.1"
ftfy = "^6.1.1"
rapidfuzz = "^3.8.1"
surya-ocr = "^0.17.0"
regex = "^2024.4.28"
pdftext = "~0.6.3"
markdownify = "^1.1.0"
click = "^8.2.0"
markdown2 = "^2.5.2"
filetype = "^1.2.0"
google-genai = "^1.0.0"
anthropic = "^0.46.0"
pre-commit = "^4.2.0"
scikit-learn = "^1.6.1"

# Optional dependencies for documents
mammoth = {version = "^1.9.0", optional = true}
openpyxl = {version = "^3.1.5", optional = true}
python-pptx = {version = "^1.0.2", optional = true}
ebooklib = {version = "^0.18", optional = true}
weasyprint = {version = "^63.1", optional = true}
openai = "^1.65.2"

[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"
datasets = "^2.21.0"
streamlit = "^1.37.1"
fastapi = "^0.115.4"
uvicorn = "^0.32.0"
python-multipart = "^0.0.16"
pytest = "^8.3.3"
pytest-mock = "^3.14.0"
apted = "1.0.3"
distance = "0.1.3"
lxml = "5.3.0"
tabulate = "^0.9.0"
latex2mathml = "^3.77.0"
playwright = "^1.49.1"

[tool.poetry.extras]
full = ["mammoth", "openpyxl", "python-pptx", "ebooklib", "weasyprint"]

[tool.poetry.scripts]
marker = "marker.scripts.convert:convert_cli"
marker_single = "marker.scripts.convert_single:convert_single_cli"
marker_chunk_convert = "marker.scripts.chunk_convert:chunk_convert_cli"
marker_gui = "marker.scripts.run_streamlit_app:streamlit_app_cli"
marker_extract = "marker.scripts.run_streamlit_app:extraction_app_cli"
marker_server = "marker.scripts.server:server_cli"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

```

--------------------------------------------------------------------------------
/marker/converters/table.py:
--------------------------------------------------------------------------------

```python
from typing import Tuple, List

from marker.builders.document import DocumentBuilder
from marker.builders.line import LineBuilder
from marker.builders.ocr import OcrBuilder
from marker.converters.pdf import PdfConverter
from marker.processors import BaseProcessor
from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
from marker.processors.llm.llm_form import LLMFormProcessor
from marker.processors.llm.llm_table import LLMTableProcessor
from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
from marker.processors.table import TableProcessor
from marker.providers.registry import provider_from_filepath
from marker.schema import BlockTypes


class TableConverter(PdfConverter):
    """Converter that extracts only table-like blocks (tables, forms, TOCs)."""

    default_processors: Tuple[BaseProcessor, ...] = (
        TableProcessor,
        LLMTableProcessor,
        LLMTableMergeProcessor,
        LLMFormProcessor,
        LLMComplexRegionProcessor,
    )
    # NOTE(review): annotated as List but assigned a tuple — confirm whether
    # the converter config machinery coerces this.
    converter_block_types: List[BlockTypes] = (
        BlockTypes.Table,
        BlockTypes.Form,
        BlockTypes.TableOfContents,
    )

    def build_document(self, filepath: str):
        """Build a document with OCR disabled, keep only table-like blocks, run processors."""
        provider_cls = provider_from_filepath(filepath)
        layout_builder = self.resolve_dependencies(self.layout_builder_class)
        line_builder = self.resolve_dependencies(LineBuilder)
        ocr_builder = self.resolve_dependencies(OcrBuilder)
        document_builder = DocumentBuilder(self.config)
        # Tables are OCRed later by TableProcessor, not during document build.
        document_builder.disable_ocr = True

        provider = provider_cls(filepath, self.config)
        document = document_builder(provider, layout_builder, line_builder, ocr_builder)

        # Drop every page-level block that is not a table/form/TOC.
        for page in document.pages:
            page.structure = [
                p for p in page.structure if p.block_type in self.converter_block_types
            ]

        for processor in self.processor_list:
            processor(document)

        return document

    def __call__(self, filepath: str):
        """Convert the file and render the filtered document."""
        document = self.build_document(filepath)
        self.page_count = len(document.pages)

        renderer = self.resolve_dependencies(self.renderer)
        return renderer(document)
```

--------------------------------------------------------------------------------
/marker/schema/blocks/basetable.py:
--------------------------------------------------------------------------------

```python
from typing import List

from marker.schema import BlockTypes
from marker.schema.blocks import Block, BlockOutput
from marker.schema.blocks.tablecell import TableCell


class BaseTable(Block):
    """Shared HTML assembly for table-like blocks (tables, forms, TOCs)."""

    block_type: BlockTypes | None = None
    html: str | None = None

    @staticmethod
    def format_cells(
        document, child_blocks, block_config, child_cells: List[TableCell] | None = None
    ):
        """Render TableCell children as <table> HTML, row by row in id order."""
        if child_cells is None:
            child_cells = [
                document.get_block(child.id)
                for child in child_blocks
                if child.id.block_type == BlockTypes.TableCell
            ]

        pieces = ["<table><tbody>"]
        for row_id in sorted({cell.row_id for cell in child_cells}):
            row = sorted(
                (cell for cell in child_cells if cell.row_id == row_id),
                key=lambda cell: cell.col_id,
            )
            pieces.append("<tr>")
            for cell in row:
                pieces.append(
                    cell.assemble_html(document, child_blocks, None, block_config)
                )
            pieces.append("</tr>")
        pieces.append("</tbody></table>")
        return "".join(pieces)

    def assemble_html(
        self,
        document,
        child_blocks: List[BlockOutput],
        parent_structure=None,
        block_config: dict | None = None,
    ):
        """Produce the block's HTML: LLM output first, then cells, then plain text."""
        # Pass only reference children upward so table cells don't render twice.
        ref_children = [
            child
            for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        template = super().assemble_html(
            document, ref_children, parent_structure, block_config
        )

        if self.html:
            # Populated by the LLM processor.
            return template + self.html

        has_cells = any(
            child.id.block_type == BlockTypes.TableCell for child in child_blocks
        )
        if has_cells:
            # Populated by the table processor.
            return template + self.format_cells(document, child_blocks, block_config)

        # Fall back to regular text lines and spans.
        return f"<p>{template}</p>"

```

--------------------------------------------------------------------------------
/benchmarks/overall/download/mathpix.py:
--------------------------------------------------------------------------------

```python
import json
import time

import requests

from benchmarks.overall.download.base import Downloader


class MathpixDownloader(Downloader):
    """Downloader that converts benchmark PDFs to markdown via the Mathpix API."""

    service = "mathpix"

    def get_html(self, pdf_bytes):
        """Convert one PDF through Mathpix; return its markdown and wall time.

        Returns an empty markdown string if conversion is still processing or errored.
        """
        headers = {
            "app_id": self.app_id,
            "app_key": self.api_key,
        }
        start = time.time()
        pdf_id = mathpix_request(pdf_bytes, headers)
        status = mathpix_status(pdf_id, headers)
        if status in ["processing", "error"]:
            md = ""
        else:
            md = mathpix_results(pdf_id, headers)
        end = time.time()
        # mathpix_results returns raw bytes; normalize to str.
        if isinstance(md, bytes):
            md = md.decode("utf-8")

        return {
            "md": md,
            "time": end - start
        }

def mathpix_request(buffer, headers):
    """Submit a PDF buffer to the Mathpix v3 API and return the assigned pdf_id.

    Requests both markdown and HTML conversion formats.
    """
    options = {
        "conversion_formats": {
            "md": True,
            "html": True
        }
    }
    response = requests.post(
        "https://api.mathpix.com/v3/pdf",
        headers=headers,
        data={"options_json": json.dumps(options)},
        files={"file": buffer},
    )
    return response.json()["pdf_id"]

def mathpix_status(pdf_id, headers):
    """Poll Mathpix until both md and html conversions finish (or error).

    Polls once per second, up to max_iters times.

    Returns:
        "completed" if both conversions completed, otherwise "error"
        (including when the poll budget is exhausted).
    """
    max_iters = 120
    status = "processing"
    status2 = "processing"
    # Fix: the original `while i < max_iters` loop never incremented i, so it
    # could spin forever when the job stayed in "processing". A for-loop caps
    # the number of polls as intended.
    for _ in range(max_iters):
        time.sleep(1)
        response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}",
            headers=headers
        )
        status_resp = response.json()
        # The status payload may not include conversion info yet.
        if "conversion_status" not in status_resp:
            continue
        status = status_resp["conversion_status"]["md"]["status"]
        status2 = status_resp["conversion_status"]["html"]["status"]
        if status == "completed" and status2 == "completed":
            break
        elif status == "error" or status2 == "error":
            break
    out_status = "completed" if status == "completed" and status2 == "completed" else "error"
    return out_status

def mathpix_results(pdf_id, headers, ext="md"):
    """Download the converted output (default markdown) for a finished Mathpix job.

    Returns raw response bytes; callers are responsible for decoding.
    """
    response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}",
        headers=headers
    )
    return response.content

```

--------------------------------------------------------------------------------
/tests/builders/test_garbled_pdf.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.builders.document import DocumentBuilder
from marker.builders.line import LineBuilder
from marker.processors.table import TableProcessor
from marker.schema import BlockTypes


@pytest.mark.filename("water_damage.pdf")
def test_garbled_pdf(pdf_document, recognition_model, table_rec_model, detection_model):
    """Garbled PDFs: table lines only become cells after TableProcessor runs."""
    page = pdf_document.pages[0]
    first_id = page.structure[0]
    assert first_id == "/page/0/Table/0"

    table_block = page.get_block(first_id)
    assert table_block.block_type == BlockTypes.Table
    assert table_block.structure[0] == "/page/0/Line/10"
    assert page.get_block(table_block.structure[0]).block_type == BlockTypes.Line

    # We don't OCR in the initial pass, only with the TableProcessor
    TableProcessor(recognition_model, table_rec_model, detection_model)(pdf_document)

    table = page.contained_blocks(pdf_document, (BlockTypes.Table,))[0]
    assert "варіант" in table.raw_text(pdf_document)

    # The line child has now been replaced by a table cell.
    assert page.get_block(table_block.structure[0]).block_type == BlockTypes.TableCell


@pytest.mark.filename("hindi_judgement.pdf")
@pytest.mark.config({"page_range": [2, 3], "disable_ocr": True})
def test_garbled_builder(config, doc_provider, detection_model, ocr_error_model):
    """OCR-error detection should flag at least one page of garbled text as bad."""
    document = DocumentBuilder(config).build_document(doc_provider)
    detector = LineBuilder(detection_model, ocr_error_model, config)

    labels = detector.ocr_error_detection(
        document.pages, doc_provider.page_lines
    ).labels
    assert len(labels) == 2
    assert any(label == "bad" for label in labels)


@pytest.mark.filename("adversarial.pdf")
@pytest.mark.config({"page_range": [2, 3], "disable_ocr": True})
def test_nongarbled_builder(config, doc_provider, detection_model, ocr_error_model):
    """OCR-error detection should label clean text pages as good."""
    document = DocumentBuilder(config).build_document(doc_provider)
    detector = LineBuilder(detection_model, ocr_error_model, config)

    labels = detector.ocr_error_detection(
        document.pages, doc_provider.page_lines
    ).labels
    assert len(labels) == 2
    assert all(label == "good" for label in labels)

```

--------------------------------------------------------------------------------
/marker/processors/order.py:
--------------------------------------------------------------------------------

```python
from statistics import mean
from collections import defaultdict

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document


class OrderProcessor(BaseProcessor):
    """
    A processor for sorting the blocks in order if needed.  This can help when the layout image was sliced.
    """
    block_types = tuple()

    def __call__(self, document: Document):
        for page in document.pages:
            # Skip OCRed pages
            if page.text_extraction_method != "pdftext":
                continue

            # Skip pages without layout slicing
            if not page.layout_sliced:
                continue

            # Ordering key per block; defaults to 0 for blocks without spans.
            block_idxs = defaultdict(int)
            for block_id in page.structure:
                block = document.get_block(block_id)
                spans = block.contained_blocks(document, (BlockTypes.Span, ))
                if len(spans) == 0:
                    continue

                # Avg span position in original PDF
                block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2

            # Second pass: interpolate a key for span-less blocks from the
            # nearest neighbor that has one.
            for block_id in page.structure:
                # Already assigned block id via span position
                if block_idxs[block_id] > 0:
                    continue

                block = document.get_block(block_id)
                prev_block = document.get_prev_block(block)
                next_block = document.get_next_block(block)

                block_idx_add = 0
                if prev_block:
                    block_idx_add = 1

                # Walk backwards to the closest block with a known key,
                # counting the steps taken as a positive offset.
                while prev_block and prev_block.id not in block_idxs:
                    prev_block = document.get_prev_block(prev_block)
                    block_idx_add += 1

                if not prev_block:
                    # No anchor before this block; walk forwards instead and
                    # use a negative offset so this block sorts ahead of it.
                    block_idx_add = -1
                    while next_block and next_block.id not in block_idxs:
                        next_block = document.get_next_block(next_block)
                        block_idx_add -= 1

                if not next_block and not prev_block:
                    # No anchor in either direction; keep the default key (0).
                    pass
                elif prev_block:
                    block_idxs[block_id] = block_idxs[prev_block.id] + block_idx_add
                else:
                    block_idxs[block_id] = block_idxs[next_block.id] + block_idx_add

            # Reorder the page by the computed keys.
            page.structure = sorted(page.structure, key=lambda x: block_idxs[x])


```

--------------------------------------------------------------------------------
/tests/converters/test_extraction_converter.py:
--------------------------------------------------------------------------------

```python
import json
import pytest

from marker.converters.extraction import ExtractionConverter
from marker.extractors.page import PageExtractionSchema
from marker.extractors.document import DocumentExtractionSchema
from marker.services import BaseService


class MockLLMService(BaseService):
    """Stand-in LLM service that returns canned payloads keyed on the requested schema."""

    def __call__(self, prompt, image=None, page=None, response_schema=None, **kwargs):
        # Dispatch table instead of an if/elif chain; unknown schemas get an
        # empty dict so callers degrade gracefully.
        canned_responses = {
            PageExtractionSchema: {
                "description": "Mock extraction description",
                "detailed_notes": "Mock detailed notes for page extraction",
            },
            DocumentExtractionSchema: {
                "analysis": "Mock document analysis",
                "document_json": json.dumps({"test_key": "test_value"}),
            },
        }
        return canned_responses.get(response_schema, {})


@pytest.fixture
def mock_llm_service():
    """Expose the mock service class (not an instance) so tests can wire it in."""
    return MockLLMService


@pytest.fixture
def extraction_converter(config, model_dict, mock_llm_service):
    """Build an ExtractionConverter wired to the mock LLM service."""
    schema = {
        "title": "TestSchema",
        "type": "object",
        "properties": {"test_key": {"title": "Test Key", "type": "string"}},
        "required": ["test_key"],
    }

    config.update(
        {"page_schema": json.dumps(schema), "output_format": "markdown"}
    )
    model_dict["llm_service"] = mock_llm_service

    converter = ExtractionConverter(
        artifact_dict=model_dict, processor_list=None, config=config
    )
    converter.llm_service = mock_llm_service
    converter.default_llm_service = MockLLMService
    return converter


@pytest.mark.config({"page_range": [0]})
def test_extraction_converter(config, model_dict, mock_llm_service, temp_doc):
    """An unparseable page_schema must not prevent extraction from completing."""
    config["page_schema"] = "invalid json"
    model_dict["llm_service"] = mock_llm_service

    converter = ExtractionConverter(
        artifact_dict=model_dict, processor_list=None, config=config
    )
    converter.artifact_dict["llm_service"] = mock_llm_service()

    output = converter(temp_doc.name)
    assert output.document_json == '{"test_key": "test_value"}'


@pytest.mark.config({"page_range": [0, 1]})
def test_extraction_converter_multiple_pages(extraction_converter, temp_doc):
    """Extraction over several pages still yields the aggregated document JSON."""
    output = extraction_converter(temp_doc.name)

    assert output is not None
    assert output.analysis == "Mock document analysis"
    assert output.document_json is not None
    assert json.loads(output.document_json) == {"test_key": "test_value"}

```

--------------------------------------------------------------------------------
/marker/processors/llm/llm_meta.py:
--------------------------------------------------------------------------------

```python
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Dict, List

from tqdm import tqdm

from marker.logger import get_logger
from marker.processors.llm import BaseLLMProcessor, BaseLLMSimpleBlockProcessor
from marker.schema.document import Document
from marker.services import BaseService

logger = get_logger()


class LLMSimpleBlockMetaProcessor(BaseLLMProcessor):
    """
    A wrapper for simple LLM processors, so they can all run in parallel.

    Collects the prompts from every wrapped processor, fans them out to the
    LLM service on a thread pool, and hands each response back to the
    processor that produced the prompt.
    """

    def __init__(
        self,
        processor_lst: List[BaseLLMSimpleBlockProcessor],
        llm_service: BaseService,
        config=None,
    ):
        super().__init__(llm_service, config)
        self.processors = processor_lst

    def __call__(self, document: Document):
        # Nothing to do when LLM usage is disabled or no service is wired up.
        if not self.use_llm or self.llm_service is None:
            return

        total = sum(
            len(processor.inference_blocks(document)) for processor in self.processors
        )
        pbar = tqdm(
            desc="LLM processors running", disable=self.disable_tqdm, total=total
        )

        all_prompts = [
            processor.block_prompts(document) for processor in self.processors
        ]
        futures_map: Dict[Any, Dict[str, Any]] = {}
        with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
            for i, prompt_lst in enumerate(all_prompts):
                for prompt in prompt_lst:
                    future = executor.submit(self.get_response, prompt)
                    futures_map[future] = {"processor_idx": i, "prompt_data": prompt}

            # Consume results as they finish so the progress bar advances in
            # real time instead of stalling behind slow early submissions.
            for future in as_completed(list(futures_map)):
                # Pop before result() so a failed future doesn't linger in the map.
                future_data = futures_map.pop(future)
                try:
                    result = future.result()
                    processor: BaseLLMSimpleBlockProcessor = self.processors[
                        future_data["processor_idx"]
                    ]
                    # finalize the result
                    processor(result, future_data["prompt_data"], document)
                except Exception as e:
                    logger.warning(f"Error processing LLM response: {e}")

                pbar.update(1)

        pbar.close()

    def get_response(self, prompt_data: Dict[str, Any]):
        """Call the configured LLM service with one prompt bundle."""
        return self.llm_service(
            prompt_data["prompt"],
            prompt_data["image"],
            prompt_data["block"],
            prompt_data["schema"],
        )

```

--------------------------------------------------------------------------------
/marker/converters/__init__.py:
--------------------------------------------------------------------------------

```python
import inspect
from typing import Optional, List, Type

from pydantic import BaseModel

from marker.processors import BaseProcessor
from marker.processors.llm import BaseLLMSimpleBlockProcessor
from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
from marker.util import assign_config, download_font


class BaseConverter:
    """Base class for converters: holds config, resolves processor
    dependencies, and assembles the processor pipeline."""

    def __init__(self, config: Optional[BaseModel | dict] = None):
        assign_config(self, config)
        self.config = config
        self.llm_service = None

        # Download render font, needed for some providers
        download_font()

    def __call__(self, *args, **kwargs):
        raise NotImplementedError

    def resolve_dependencies(self, cls):
        """Instantiate ``cls`` by resolving each __init__ parameter.

        Resolution order: ``config`` -> ``self.config``; a name present in
        ``self.artifact_dict`` (set by subclasses) -> that artifact;
        otherwise the parameter's declared default.

        Raises:
            ValueError: when a required parameter cannot be satisfied.
        """
        init_signature = inspect.signature(cls.__init__)

        resolved_kwargs = {}
        for param_name, param in init_signature.parameters.items():
            if param_name == 'self':
                continue
            elif param_name == 'config':
                resolved_kwargs[param_name] = self.config
            elif param_name in self.artifact_dict:
                resolved_kwargs[param_name] = self.artifact_dict[param_name]
            elif param.default is not inspect.Parameter.empty:
                # Sentinel check: identity, not equality.
                resolved_kwargs[param_name] = param.default
            else:
                raise ValueError(f"Cannot resolve dependency for parameter: {param_name}")

        return cls(**resolved_kwargs)

    def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) -> List[BaseProcessor]:
        """Build processor instances, folding all simple LLM processors into a
        single meta-processor so their requests run concurrently."""
        processors = [self.resolve_dependencies(cls) for cls in processor_cls_lst]

        simple_llm_processors = [p for p in processors if isinstance(p, BaseLLMSimpleBlockProcessor)]
        other_processors = [p for p in processors if not isinstance(p, BaseLLMSimpleBlockProcessor)]

        if not simple_llm_processors:
            return processors

        # Place the meta processor where the last simple LLM processor sat,
        # adjusted for the ones removed before it.
        llm_positions = [i for i, p in enumerate(processors) if isinstance(p, BaseLLMSimpleBlockProcessor)]
        insert_position = max(0, llm_positions[-1] - len(simple_llm_processors) + 1)

        meta_processor = LLMSimpleBlockMetaProcessor(
            processor_lst=simple_llm_processors,
            llm_service=self.llm_service,
            config=self.config,
        )
        other_processors.insert(insert_position, meta_processor)
        return other_processors
```

--------------------------------------------------------------------------------
/marker/providers/registry.py:
--------------------------------------------------------------------------------

```python
import filetype
import filetype.match as file_match
from bs4 import BeautifulSoup
from filetype.types import archive, document, IMAGE

from marker.providers.document import DocumentProvider
from marker.providers.epub import EpubProvider
from marker.providers.html import HTMLProvider
from marker.providers.image import ImageProvider
from marker.providers.pdf import PdfProvider
from marker.providers.powerpoint import PowerPointProvider
from marker.providers.spreadsheet import SpreadSheetProvider

# Maps a coarse document type to the filetype matcher classes that detect it
# from magic bytes. "image" reuses the package's full IMAGE matcher list.
DOCTYPE_MATCHERS = {
    "image": IMAGE,
    "pdf": [
        archive.Pdf,
    ],
    "epub": [
        archive.Epub,
    ],
    "doc": [document.Docx],
    "xls": [document.Xlsx],
    "ppt": [document.Pptx],
}


def load_matchers(doctype: str):
    """Instantiate every filetype matcher registered for ``doctype``."""
    matcher_classes = DOCTYPE_MATCHERS[doctype]
    return [matcher_cls() for matcher_cls in matcher_classes]


def load_extensions(doctype: str):
    """Collect the canonical file extension of each matcher for ``doctype``."""
    return [matcher_cls.EXTENSION for matcher_cls in DOCTYPE_MATCHERS[doctype]]


def provider_from_ext(filepath: str):
    """Pick a provider class from the filename extension alone.

    Matching is case-insensitive; files with no extension or an
    unrecognized one fall back to PdfProvider.
    """
    if "." not in filepath:
        # rsplit on a dotless path returns the whole path, which is not an
        # extension — the previous emptiness check never fired here.
        return PdfProvider

    ext = filepath.rsplit(".", 1)[-1].strip().lower()
    if not ext:
        return PdfProvider

    if ext in load_extensions("image"):
        return ImageProvider
    if ext in load_extensions("pdf"):
        return PdfProvider
    if ext in load_extensions("doc"):
        return DocumentProvider
    if ext in load_extensions("xls"):
        return SpreadSheetProvider
    if ext in load_extensions("ppt"):
        return PowerPointProvider
    if ext in load_extensions("epub"):
        return EpubProvider
    if ext in ["html"]:
        return HTMLProvider

    return PdfProvider


def provider_from_filepath(filepath: str):
    """Pick a provider class by sniffing file content (magic bytes, then HTML),
    falling back to extension-based detection when sniffing is inconclusive."""
    if filetype.image_match(filepath) is not None:
        return ImageProvider

    # Content-based checks, in priority order.
    content_checks = (
        ("pdf", PdfProvider),
        ("epub", EpubProvider),
        ("doc", DocumentProvider),
        ("xls", SpreadSheetProvider),
        ("ppt", PowerPointProvider),
    )
    for doctype, provider_cls in content_checks:
        if file_match(filepath, load_matchers(doctype)) is not None:
            return provider_cls

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            markup = f.read()
        soup = BeautifulSoup(markup, "html.parser")
        # Any parsed tag at all means the content is likely HTML.
        if soup.find():
            return HTMLProvider
    except Exception:
        pass

    # Fallback if we incorrectly detect the file type
    return provider_from_ext(filepath)

```

--------------------------------------------------------------------------------
/marker/processors/blank_page.py:
--------------------------------------------------------------------------------

```python
from typing import Annotated

from PIL import Image
import numpy as np
import cv2

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.schema.document import Document

from marker.logger import get_logger

logger = get_logger()


class BlankPageProcessor(BaseProcessor):
    """
    A processor to filter out blank pages detected as a single layout block
    """

    # NOTE(review): compared against page/block polygon intersection_area —
    # appears to be a coverage ratio; confirm units against PolygonBox.
    full_page_block_intersection_threshold: Annotated[
        float, "Threshold to detect blank pages at"
    ] = 0.8
    filter_blank_pages: Annotated[bool, "Remove blank pages detected as images."] = (
        False
    )

    def is_blank(self, image: Image.Image):
        """Return True when the image contains no detectable foreground.

        The image is blurred and adaptively thresholded (text becomes
        white); the page is blank iff no foreground pixel survives.
        """
        image = np.asarray(image)
        if image.size == 0 or image.shape[0] == 0 or image.shape[1] == 0:
            # Handle empty image case
            return True

        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        gray = cv2.GaussianBlur(gray, (7, 7), 0)

        # Adaptive threshold (inverse for text as white)
        binarized = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15
        )

        # The previous implementation copied every non-background connected
        # component into a fresh mask (no size filtering, stats unused) and
        # dilated it; both steps preserve exactly the "any foreground pixel?"
        # answer, so they are dropped as dead work.
        return cv2.countNonZero(binarized) == 0

    def __call__(self, document: Document):
        if not self.filter_blank_pages:
            return

        for page in document.pages:
            # Only pages whose structure is exactly one block are candidates.
            structure_blocks = page.structure_blocks(document)
            if not structure_blocks or len(structure_blocks) > 1:
                continue

            full_page_block: Block = structure_blocks[0]

            conditions = [
                full_page_block.block_type in [BlockTypes.Picture, BlockTypes.Figure],
                self.is_blank(full_page_block.get_image(document)),
                page.polygon.intersection_area(full_page_block.polygon)
                > self.full_page_block_intersection_threshold,
            ]

            if all(conditions):
                logger.debug(f"Removing blank block {full_page_block.id}")
                page.remove_structure_items([full_page_block.id])
                full_page_block.removed = True
```

--------------------------------------------------------------------------------
/marker/processors/util.py:
--------------------------------------------------------------------------------

```python
import re

from bs4 import BeautifulSoup

from marker.schema import BlockTypes
from marker.schema.groups import PageGroup
from marker.schema.registry import get_block_class
from marker.schema.text import Line


def escape_latex_commands(text: str):
    """Replace literal newline/tab/carriage-return characters with their
    backslash-escaped two-character forms."""
    replacements = (("\n", "\\n"), ("\t", "\\t"), ("\r", "\\r"))
    for raw, escaped in replacements:
        text = text.replace(raw, escaped)
    return text


def add_math_spans_to_line(corrected_text: str, text_line: Line, page: PageGroup):
    """Convert ``corrected_text`` into Span blocks and attach them to ``text_line``.

    The final span receives a trailing newline so the line terminates
    correctly in downstream rendering.
    """
    SpanClass = get_block_class(BlockTypes.Span)
    spans = text_to_spans(corrected_text)
    last_idx = len(spans) - 1

    for idx, span in enumerate(spans):
        content = span['content']
        if idx == last_idx:
            content += "\n"

        new_span = SpanClass(
            polygon=text_line.polygon,
            text=content,
            font='Unknown',
            font_weight=0,
            font_size=0,
            minimum_position=0,
            maximum_position=0,
            formats=[span['type']],
            url=span.get('url'),
            page_id=text_line.page_id,
            text_extraction_method="gemini",
            has_superscript=span["has_superscript"],
            has_subscript=span["has_subscript"],
        )
        span_block = page.add_full_block(new_span)
        text_line.structure.append(span_block.id)


def text_to_spans(text):
    """Split an HTML fragment into flat span dicts (type, content, url,
    superscript/subscript flags); <math> content has control chars escaped."""
    tag_types = {
        'b': 'bold',
        'i': 'italic',
        'math': 'math',
        'sub': 'plain',
        'sup': 'plain',
        'span': 'plain'
    }

    def build_span(span_type, content, url, tag):
        # One uniform dict shape for both tagged and bare-string content.
        return {
            'type': span_type,
            'content': content,
            'url': url,
            "has_superscript": tag == "sup",
            "has_subscript": tag == "sub",
        }

    spans = []
    for element in BeautifulSoup(text, 'html.parser').descendants:
        # Only elements sitting directly under the document root count.
        if len(list(element.parents)) != 1:
            continue

        url = element.attrs.get('href') if hasattr(element, 'attrs') else None
        tag = element.name

        if tag in tag_types:
            content = element.get_text()
            if tag == "math":
                content = escape_latex_commands(content)
            spans.append(build_span(tag_types[tag], content, url, tag))
        elif element.string:
            spans.append(build_span('plain', element.string, url, tag))

    return spans
```

--------------------------------------------------------------------------------
/marker/providers/__init__.py:
--------------------------------------------------------------------------------

```python
from copy import deepcopy
from typing import List, Optional, Dict

from PIL import Image
from pydantic import BaseModel

from pdftext.schema import Reference

from marker.logger import configure_logging
from marker.schema.polygon import PolygonBox
from marker.schema.text import Span
from marker.schema.text.char import Char
from marker.schema.text.line import Line
from marker.settings import settings
from marker.util import assign_config

configure_logging()


class ProviderOutput(BaseModel):
    """One extracted text line plus its spans (and optional per-span chars)."""

    line: Line
    spans: List[Span]
    chars: Optional[List[List[Char]]] = None

    @property
    def raw_text(self):
        # Concatenated text of all spans, in order.
        texts = [span.text for span in self.spans]
        return "".join(texts)

    def __hash__(self):
        # Hash on the line's bounding box so equal geometry dedupes.
        return hash(tuple(self.line.polygon.bbox))

    def merge(self, other: "ProviderOutput"):
        """Return a new ProviderOutput combining self and ``other``.

        Neither input is mutated: spans/chars are concatenated and the line
        polygon is expanded to cover both lines.
        """
        merged = deepcopy(self)
        incoming = deepcopy(other)

        merged.spans.extend(incoming.spans)
        if merged.chars is None:
            merged.chars = incoming.chars
        elif incoming.chars is not None:
            merged.chars.extend(incoming.chars)

        merged.line.polygon = merged.line.polygon.merge([incoming.line.polygon])
        return merged


# Mapping of page index -> the provider outputs (lines) found on that page.
ProviderPageLines = Dict[int, List[ProviderOutput]]


class BaseProvider:
    """Interface for document providers.

    Subclasses implement page counting, rasterization, text-line and
    reference extraction for a concrete input format. The base methods here
    are stubs (they return None); they define the expected signatures.
    """

    def __init__(self, filepath: str, config: Optional[BaseModel | dict] = None):
        # Copy config keys onto attributes declared by the subclass.
        assign_config(self, config)
        self.filepath = filepath

    def __len__(self):
        # Stub: subclasses return the number of pages.
        pass

    def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
        # Stub: subclasses render the given page indices at the given DPI.
        pass

    def get_page_bbox(self, idx: int) -> PolygonBox | None:
        # Stub: subclasses return the page's bounding polygon.
        pass

    def get_page_lines(self, idx: int) -> List[Line]:
        # Stub: subclasses return the extracted text lines for a page.
        pass

    def get_page_refs(self, idx: int) -> List[Reference]:
        # Stub: subclasses return link/reference targets for a page.
        pass

    def __enter__(self):
        return self

    @staticmethod
    def get_font_css():
        """Build a weasyprint CSS object that registers the downloaded render
        font and disables ligatures (imports deferred: weasyprint is slow)."""
        from weasyprint import CSS
        from weasyprint.text.fonts import FontConfiguration

        font_config = FontConfiguration()
        css = CSS(
            string=f"""
            @font-face {{
                font-family: GoNotoCurrent-Regular;
                src: url({settings.FONT_PATH});
                font-display: swap;
            }}
            body {{
                font-family: {settings.FONT_NAME.split(".")[0]}, sans-serif;
                font-variant-ligatures: none;
                font-feature-settings: "liga" 0;
                text-rendering: optimizeLegibility;
            }}
            """,
            font_config=font_config,
        )
        return css

```

--------------------------------------------------------------------------------
/marker/providers/document.py:
--------------------------------------------------------------------------------

```python
import base64
import os
import re
import tempfile
from io import BytesIO

from PIL import Image
from marker.logger import get_logger

from marker.providers.pdf import PdfProvider

logger = get_logger()

# Stylesheet applied when rendering converted DOCX HTML to PDF: A4 pages,
# images contained within the page, and table rows kept on one page.
css = """
@page {
    size: A4;
    margin: 2cm;
}

img {
    max-width: 100%;
    max-height: 25cm;
    object-fit: contain;
    margin: 12pt auto;
}

div, p {
    max-width: 100%;
    word-break: break-word;
    font-size: 10pt;
}

table {
    width: 100%;
    border-collapse: collapse;
    break-inside: auto;
    font-size: 10pt;
}

tr {
    break-inside: avoid;
    page-break-inside: avoid;
}

td {
    border: 0.75pt solid #000;
    padding: 6pt;
}
"""


class DocumentProvider(PdfProvider):
    """Provider that converts a DOCX file to PDF (via HTML) and then behaves
    like a PdfProvider over the converted temp file."""

    def __init__(self, filepath: str, config=None):
        # The converted PDF lives in a temp file for the provider's lifetime.
        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        self.temp_pdf_path = temp_pdf.name
        temp_pdf.close()

        # Convert DOCX to PDF
        try:
            self.convert_docx_to_pdf(filepath)
        except Exception as e:
            # Don't leak the temp file when conversion fails; chain the cause.
            if os.path.exists(self.temp_pdf_path):
                os.remove(self.temp_pdf_path)
            raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}") from e

        # Initialize the PDF provider with the temp pdf path
        super().__init__(self.temp_pdf_path, config)

    def __del__(self):
        # getattr guard: __init__ may have failed before the attribute was set.
        temp_path = getattr(self, "temp_pdf_path", None)
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)

    def convert_docx_to_pdf(self, filepath: str):
        """Convert ``filepath`` (DOCX) to a PDF at ``self.temp_pdf_path``."""
        from weasyprint import CSS, HTML
        import mammoth

        with open(filepath, "rb") as docx_file:
            # we convert the docx to HTML
            result = mammoth.convert_to_html(docx_file)
            html = result.value

            # We convert the HTML into a PDF
            HTML(string=self._preprocess_base64_images(html)).write_pdf(
                self.temp_pdf_path, stylesheets=[CSS(string=css), self.get_font_css()]
            )

    @staticmethod
    def _preprocess_base64_images(html_content):
        """Re-encode every base64 data-URI image in ``html_content``; images
        that fail to decode are replaced with an empty string."""
        pattern = r'data:([^;]+);base64,([^"\'>\s]+)'

        def convert_image(match):
            try:
                img_data = base64.b64decode(match.group(2))

                with BytesIO(img_data) as bio:
                    with Image.open(bio) as img:
                        output = BytesIO()
                        img.save(output, format=img.format)
                        new_base64 = base64.b64encode(output.getvalue()).decode()
                        return f"data:{match.group(1)};base64,{new_base64}"

            except Exception as e:
                logger.error(f"Failed to process image: {e}")
                return ""  # we ditch broken images as that breaks the PDF creation down the line

        return re.sub(pattern, convert_image, html_content)

```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/olmocr.py:
--------------------------------------------------------------------------------

```python
import base64
import json
import tempfile
import time
from io import BytesIO

import torch
from PIL import Image

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


def convert_single_page(filename: str, model, processor, device):
    """Run olmocr on page 1 of ``filename`` and return the extracted text.

    Renders the page to a base64 PNG, builds the olmocr finetuning prompt
    from the PDF's anchor text, runs the model, and parses the JSON reply.
    Returns "" when the model output cannot be parsed at all.
    """
    from olmocr.data.renderpdf import render_pdf_to_base64png
    from olmocr.prompts import build_finetuning_prompt
    from olmocr.prompts.anchor import get_anchor_text

    image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1024)

    # Build the prompt, using document metadata
    anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    # Build the full prompt
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]

    # Apply the chat template and processor
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

    inputs = processor(
        text=[text],
        images=[main_image],
        padding=True,
        return_tensors="pt",
    )
    # Move every tensor to the model's device before generation.
    inputs = {key: value.to(device) for (key, value) in inputs.items()}

    # Generate the output
    output = model.generate(
        **inputs,
        temperature=0.8,
        max_new_tokens=8192,
        num_return_sequences=1,
        do_sample=True,
    )

    # Decode the output
    # Strip the prompt tokens; only newly generated tokens are decoded.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_length:]
    text_output = processor.tokenizer.batch_decode(
        new_tokens, skip_special_tokens=True
    )[0]

    try:
        # Expected reply is JSON with a "natural_text" field.
        text_output = json.loads(text_output)
        text = text_output["natural_text"]
    except Exception:
        try:
            # Crude fallback when the reply is not valid JSON.
            text = text_output.split("natural_text")[1].strip()
        except Exception:
            text = ""

    return text


class OlmOCRMethod(BaseMethod):
    # Dict holding the loaded olmocr "model" and "processor" objects.
    olmocr_model: dict = None
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        """Run olmocr on a single-page PDF sample and time the conversion."""
        pdf_bytes = sample["pdf"]  # This is a single page PDF

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            # Flush so the renderer, which reads f.name by path, sees the
            # full file rather than partially buffered bytes.
            f.flush()
            start = time.time()
            result = convert_single_page(
                f.name,
                self.olmocr_model["model"],
                self.olmocr_model["processor"],
                self.olmocr_model["model"].device,
            )
            total = time.time() - start

        return {
            "markdown": result,
            "time": total
        }

```

--------------------------------------------------------------------------------
/tests/renderers/test_markdown_renderer.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.renderers.markdown import MarkdownRenderer
from marker.schema import BlockTypes
from marker.schema.blocks import TableCell


@pytest.mark.config({"page_range": [0], "disable_ocr": True})
def test_markdown_renderer(pdf_document):
    """Rendering without OCR keeps the document title as a heading."""
    rendered = MarkdownRenderer()(pdf_document)
    assert "# Subspace Adversarial Training" in rendered.markdown


@pytest.mark.config({"page_range": [0]})
def test_markdown_renderer_auto_ocr(pdf_document):
    """With OCR enabled, the title text still appears (heading level aside)."""
    rendered = MarkdownRenderer()(pdf_document)
    assert "Subspace Adversarial Training" in rendered.markdown


@pytest.mark.config({"page_range": [0, 1], "paginate_output": True})
def test_markdown_renderer_pagination(pdf_document):
    """Paginated output carries a marker for each rendered page."""
    md = MarkdownRenderer({"paginate_output": True})(pdf_document).markdown

    for page_marker in ("\n\n{0}-", "\n\n{1}-"):
        assert page_marker in md


@pytest.mark.config({"page_range": [0, 1], "paginate_output": True})
def test_markdown_renderer_pagination_blank_last_page(pdf_document):
    """A blank trailing page still ends with its pagination marker."""
    # Simulate a blank page by stripping the last page's content.
    blank_page = pdf_document.pages[-1]
    blank_page.children = []
    blank_page.structure = []

    md = MarkdownRenderer({"paginate_output": True})(pdf_document).markdown

    # Should end with pagination marker and preserve trailing newlines.
    acceptable_endings = (
        "}\n\n",
        "}------------------------------------------------\n\n",
    )
    assert md.endswith(acceptable_endings)


@pytest.mark.config({"page_range": [0, 1]})
def test_markdown_renderer_metadata(pdf_document):
    """Rendered output metadata includes a table of contents."""
    output = MarkdownRenderer({"paginate_output": True})(pdf_document)
    assert "table_of_contents" in output.metadata


@pytest.mark.config({"page_range": [0, 1]})
def test_markdown_renderer_images(pdf_document):
    """With image extraction disabled, no images or image links are emitted."""
    output = MarkdownRenderer({"extract_images": False})(pdf_document)

    assert not output.images
    assert "![](" not in output.markdown


@pytest.mark.config({"page_range": [5]})
def test_markdown_renderer_tables(pdf_document):
    """Table cells render inline tags and math with the expected spacing."""
    table = pdf_document.contained_blocks((BlockTypes.Table,))[0]
    page = pdf_document.pages[0]

    # Replace the table's contents with a single known cell.
    cell = TableCell(
        polygon=table.polygon,
        text_lines=["54<i>.45</i>67<br>89<math>x</math>"],
        rowspan=1,
        colspan=1,
        row_id=0,
        col_id=0,
        is_header=False,
        page_id=page.page_id,
    )
    page.add_full_block(cell)
    table.structure = []
    table.add_structure(cell)

    md = MarkdownRenderer()(pdf_document).markdown
    assert "54 <i>.45</i> 67<br>89 $x$" in md

```

--------------------------------------------------------------------------------
/marker/schema/registry.py:
--------------------------------------------------------------------------------

```python
from typing import Dict, Type
from importlib import import_module

from marker.schema import BlockTypes
from marker.schema.blocks import (
    Block,
    Caption,
    Code,
    Equation,
    Figure,
    Footnote,
    Form,
    Handwriting,
    InlineMath,
    ListItem,
    PageFooter,
    PageHeader,
    Picture,
    SectionHeader,
    Table,
    TableOfContents,
    Text,
    ComplexRegion,
    TableCell,
    Reference,
)
from marker.schema.document import Document
from marker.schema.groups import (
    FigureGroup,
    ListGroup,
    PageGroup,
    PictureGroup,
    TableGroup,
)
from marker.schema.text import Line, Span
from marker.schema.text.char import Char

# Maps each block type to the dotted import path of its implementing class;
# classes are imported lazily on lookup.
BLOCK_REGISTRY: Dict[BlockTypes, str] = {}


def register_block_class(block_type: BlockTypes, block_cls: Type[Block]):
    """Record the dotted import path for ``block_cls`` under ``block_type``."""
    dotted_path = f"{block_cls.__module__}.{block_cls.__name__}"
    BLOCK_REGISTRY[block_type] = dotted_path


def get_block_class(block_type: BlockTypes) -> Type[Block]:
    """Import and return the class registered for ``block_type``."""
    dotted_path = BLOCK_REGISTRY[block_type]
    module_path, class_name = dotted_path.rsplit(".", 1)
    return getattr(import_module(module_path), class_name)


# Register every block type with its implementing class. Text primitives:
register_block_class(BlockTypes.Line, Line)
register_block_class(BlockTypes.Span, Span)
register_block_class(BlockTypes.Char, Char)
# Grouping containers:
register_block_class(BlockTypes.FigureGroup, FigureGroup)
register_block_class(BlockTypes.TableGroup, TableGroup)
register_block_class(BlockTypes.ListGroup, ListGroup)
register_block_class(BlockTypes.PictureGroup, PictureGroup)
register_block_class(BlockTypes.Page, PageGroup)
# Content blocks:
register_block_class(BlockTypes.Caption, Caption)
register_block_class(BlockTypes.Code, Code)
register_block_class(BlockTypes.Figure, Figure)
register_block_class(BlockTypes.Footnote, Footnote)
register_block_class(BlockTypes.Form, Form)
register_block_class(BlockTypes.Equation, Equation)
register_block_class(BlockTypes.Handwriting, Handwriting)
register_block_class(BlockTypes.TextInlineMath, InlineMath)
register_block_class(BlockTypes.ListItem, ListItem)
register_block_class(BlockTypes.PageFooter, PageFooter)
register_block_class(BlockTypes.PageHeader, PageHeader)
register_block_class(BlockTypes.Picture, Picture)
register_block_class(BlockTypes.SectionHeader, SectionHeader)
register_block_class(BlockTypes.Table, Table)
register_block_class(BlockTypes.Text, Text)
register_block_class(BlockTypes.TableOfContents, TableOfContents)
register_block_class(BlockTypes.ComplexRegion, ComplexRegion)
register_block_class(BlockTypes.TableCell, TableCell)
register_block_class(BlockTypes.Reference, Reference)
register_block_class(BlockTypes.Document, Document)

# Sanity checks at import time: every BlockTypes member is registered, and
# each registered class declares the matching block_type default.
assert len(BLOCK_REGISTRY) == len(BlockTypes)
assert all(
    [
        get_block_class(k).model_fields["block_type"].default == k
        for k, _ in BLOCK_REGISTRY.items()
    ]
)

```

--------------------------------------------------------------------------------
/marker/processors/blockquote.py:
--------------------------------------------------------------------------------

```python
from typing import Annotated, Tuple

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document


class BlockquoteProcessor(BaseProcessor):
    """
    A processor for tagging blockquotes.

    Walks each page and marks a block as a blockquote when it is horizontally
    indented relative to (and starts below) the previous text block, or when
    it continues an already-tagged blockquote with matching left/right edges.
    Nesting depth is tracked via ``blockquote_level``.
    """
    block_types: Annotated[
        Tuple[BlockTypes],
        "The block types to process.",
    ] = (BlockTypes.Text, BlockTypes.TextInlineMath)
    min_x_indent: Annotated[
        float,
        "The minimum horizontal indentation required to consider a block as part of a blockquote.",
        "Expressed as a percentage of the block width.",
    ] = 0.1
    x_start_tolerance: Annotated[
        float,
        "The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.",
        "Expressed as a percentage of the block width.",
    ] = 0.01
    x_end_tolerance: Annotated[
        float,
        "The maximum allowable difference between the ending x-coordinates of consecutive blocks to consider them aligned.",
        "Expressed as a percentage of the block width.",
    ] = 0.01

    def __call__(self, document: Document):
        """Tag blockquotes across every page of *document* in place."""
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                # Only consider blocks with at least two structure children.
                if block.structure is None or len(block.structure) < 2:
                    continue

                next_block = page.get_next_block(block)
                if (
                    next_block is None
                    or next_block.block_type not in self.block_types
                    or next_block.structure is None
                    or next_block.ignore_for_output
                ):
                    continue

                # All tolerances/indents scale with the current block's width.
                width = block.polygon.width
                matching_x_end = (
                    abs(next_block.polygon.x_end - block.polygon.x_end)
                    < self.x_end_tolerance * width
                )
                matching_x_start = (
                    abs(next_block.polygon.x_start - block.polygon.x_start)
                    < self.x_start_tolerance * width
                )
                # Next block starts to the right of this block's left edge...
                x_indent = next_block.polygon.x_start > block.polygon.x_start + (
                    self.min_x_indent * width
                )
                # ...and below its bottom edge.
                y_indent = next_block.polygon.y_start > block.polygon.y_end

                if block.blockquote:
                    # Continue the blockquote when aligned; nest one level
                    # deeper when indented further.
                    next_block.blockquote = (matching_x_end and matching_x_start) or (
                        x_indent and y_indent
                    )
                    next_block.blockquote_level = block.blockquote_level
                    if x_indent and y_indent:
                        next_block.blockquote_level += 1
                elif len(next_block.structure) >= 2 and (x_indent and y_indent):
                    # Start a new blockquote at depth 1.
                    next_block.blockquote = True
                    next_block.blockquote_level = 1
```

--------------------------------------------------------------------------------
/tests/services/test_service_init.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.converters.pdf import PdfConverter
from marker.services.gemini import GoogleGeminiService
from marker.services.ollama import OllamaService
from marker.services.vertex import GoogleVertexService
from marker.services.openai import OpenAIService
from marker.services.azure_openai import AzureOpenAIService


@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [0]})
def test_empty_llm(pdf_converter: PdfConverter, temp_doc):
    """Without use_llm, no LLM service should be instantiated."""
    llm_service = pdf_converter.artifact_dict["llm_service"]
    assert llm_service is None
    assert pdf_converter.llm_service is None


def test_llm_no_keys(model_dict, config):
    """Enabling use_llm without any API credentials should fail fast."""
    with pytest.raises(AssertionError):
        PdfConverter(artifact_dict=model_dict, config={"use_llm": True})


@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [0], "use_llm": True, "gemini_api_key": "test"})
def test_llm_gemini(pdf_converter: PdfConverter, temp_doc):
    """A Gemini API key alone selects the default GoogleGeminiService."""
    llm_service = pdf_converter.artifact_dict["llm_service"]
    assert llm_service is not None
    assert isinstance(pdf_converter.llm_service, GoogleGeminiService)


@pytest.mark.output_format("markdown")
@pytest.mark.config(
    {
        "page_range": [0],
        "use_llm": True,
        "vertex_project_id": "test",
        "llm_service": "marker.services.vertex.GoogleVertexService",
    }
)
def test_llm_vertex(pdf_converter: PdfConverter, temp_doc):
    """An explicit vertex llm_service path instantiates GoogleVertexService."""
    llm_service = pdf_converter.artifact_dict["llm_service"]
    assert llm_service is not None
    assert isinstance(pdf_converter.llm_service, GoogleVertexService)


@pytest.mark.output_format("markdown")
@pytest.mark.config(
    {
        "page_range": [0],
        "use_llm": True,
        "llm_service": "marker.services.ollama.OllamaService",
    }
)
def test_llm_ollama(pdf_converter: PdfConverter, temp_doc):
    """The ollama llm_service path instantiates OllamaService (no key needed)."""
    llm_service = pdf_converter.artifact_dict["llm_service"]
    assert llm_service is not None
    assert isinstance(pdf_converter.llm_service, OllamaService)


@pytest.mark.output_format("markdown")
@pytest.mark.config(
    {
        "page_range": [0],
        "use_llm": True,
        "llm_service": "marker.services.openai.OpenAIService",
        "openai_api_key": "test",
    }
)
def test_llm_openai(pdf_converter: PdfConverter, temp_doc):
    """The openai llm_service path plus an API key instantiates OpenAIService."""
    llm_service = pdf_converter.artifact_dict["llm_service"]
    assert llm_service is not None
    assert isinstance(pdf_converter.llm_service, OpenAIService)


@pytest.mark.output_format("markdown")
@pytest.mark.config(
    {
        "page_range": [0],
        "use_llm": True,
        "llm_service": "marker.services.azure_openai.AzureOpenAIService",
        "azure_endpoint": "https://example.openai.azure.com",
        "azure_api_key": "test",
        "deployment_name": "test-model",
        "azure_api_version": "1",
    }
)
def test_llm_azure_openai(pdf_converter: PdfConverter, temp_doc):
    """Full Azure config instantiates AzureOpenAIService."""
    llm_service = pdf_converter.artifact_dict["llm_service"]
    assert llm_service is not None
    assert isinstance(pdf_converter.llm_service, AzureOpenAIService)

```

--------------------------------------------------------------------------------
/marker/converters/extraction.py:
--------------------------------------------------------------------------------

```python
import re
from typing import Annotated, Optional

from marker.builders.document import DocumentBuilder
from marker.builders.line import LineBuilder
from marker.builders.ocr import OcrBuilder
from marker.builders.structure import StructureBuilder
from marker.converters.pdf import PdfConverter
from marker.extractors.document import DocumentExtractor
from marker.extractors.page import PageExtractor
from marker.providers.registry import provider_from_filepath

from marker.renderers.extraction import ExtractionRenderer, ExtractionOutput
from marker.renderers.markdown import MarkdownRenderer

from marker.logger import get_logger

logger = get_logger()


class ExtractionConverter(PdfConverter):
    """
    Converts a document to paginated markdown (unless pre-converted markdown is
    supplied via ``existing_markdown``), then runs LLM-based structured
    extraction over the per-page markdown and merges the results.
    """

    # Matches the separator emitted by paginate_output: a literal
    # "{page_number}" marker followed by 48 dashes and a blank line.
    pattern: str = r"{\d+\}-{48}\n\n"
    # Fix: the field defaults to None, so the annotation must be Optional[str],
    # not str.
    existing_markdown: Annotated[
        Optional[str], "Markdown that was already converted for extraction."
    ] = None

    def build_document(self, filepath: str):
        """Build, structure, and post-process a document from *filepath*.

        Returns a ``(document, provider)`` tuple.
        """
        provider_cls = provider_from_filepath(filepath)
        layout_builder = self.resolve_dependencies(self.layout_builder_class)
        line_builder = self.resolve_dependencies(LineBuilder)
        ocr_builder = self.resolve_dependencies(OcrBuilder)
        provider = provider_cls(filepath, self.config)
        document = DocumentBuilder(self.config)(
            provider, layout_builder, line_builder, ocr_builder
        )
        structure_builder_cls = self.resolve_dependencies(StructureBuilder)
        structure_builder_cls(document)

        for processor in self.processor_list:
            processor(document)

        return document, provider

    def __call__(self, filepath: str) -> ExtractionOutput:
        """Run extraction for *filepath* and return the merged output."""
        self.config["paginate_output"] = True  # Ensure we can split the output properly
        self.config["output_format"] = (
            "markdown"  # Output must be markdown for extraction
        )
        markdown = self.existing_markdown

        if not markdown:
            document, provider = self.build_document(filepath)
            # NOTE(review): page_count is only set when we convert here, not
            # when existing_markdown is supplied — confirm downstream usage.
            self.page_count = len(document.pages)
            renderer = self.resolve_dependencies(MarkdownRenderer)
            output = renderer(document)
            markdown = output.markdown

        # Split the paginated markdown into per-page chunks; drop the segment
        # preceding the first page separator.
        output_pages = re.split(self.pattern, markdown)[1:]

        # Extraction needs an LLM service; fall back to the default if the
        # caller did not provide one.
        if self.artifact_dict.get("llm_service") is None:
            self.artifact_dict["llm_service"] = self.resolve_dependencies(
                self.default_llm_service
            )

        page_extractor = self.resolve_dependencies(PageExtractor)
        document_extractor = self.resolve_dependencies(DocumentExtractor)
        renderer = self.resolve_dependencies(ExtractionRenderer)

        # Inference in parallel
        notes = page_extractor(output_pages)
        document_output = document_extractor(notes)

        merged = renderer(document_output, markdown)
        return merged

```
Page 1/7FirstPrevNextLast