This is page 1 of 9. Use http://codebase.md/datalab-to/marker?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── ISSUE_TEMPLATE │ │ ├── breaking-bug-report.md │ │ ├── feature_request.md │ │ └── output-bug-report.md │ └── workflows │ ├── benchmarks.yml │ ├── ci.yml │ ├── cla.yml │ ├── publish.yml │ └── scripts.yml ├── .gitignore ├── .pre-commit-config.yaml ├── benchmarks │ ├── __init__.py │ ├── overall │ │ ├── __init__.py │ │ ├── display │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ └── table.py │ │ ├── download │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── llamaparse.py │ │ │ ├── main.py │ │ │ ├── mathpix.py │ │ │ └── mistral.py │ │ ├── elo.py │ │ ├── methods │ │ │ ├── __init__.py │ │ │ ├── docling.py │ │ │ ├── gt.py │ │ │ ├── llamaparse.py │ │ │ ├── marker.py │ │ │ ├── mathpix.py │ │ │ ├── mistral.py │ │ │ ├── olmocr.py │ │ │ └── schema.py │ │ ├── overall.py │ │ ├── registry.py │ │ ├── schema.py │ │ └── scorers │ │ ├── __init__.py │ │ ├── clean.py │ │ ├── heuristic.py │ │ ├── llm.py │ │ └── schema.py │ ├── table │ │ ├── __init__.py │ │ ├── gemini.py │ │ ├── inference.py │ │ ├── scoring.py │ │ └── table.py │ ├── throughput │ │ ├── __init__.py │ │ └── main.py │ └── verify_scores.py ├── chunk_convert.py ├── CLA.md ├── convert_single.py ├── convert.py ├── data │ ├── .gitignore │ ├── examples │ │ ├── json │ │ │ ├── multicolcnn.json │ │ │ ├── switch_trans.json │ │ │ └── thinkpython.json │ │ └── markdown │ │ ├── multicolcnn │ │ │ ├── _page_1_Figure_0.jpeg │ │ │ ├── _page_2_Picture_0.jpeg │ │ │ ├── _page_6_Figure_0.jpeg │ │ │ ├── _page_7_Figure_0.jpeg │ │ │ ├── multicolcnn_meta.json │ │ │ └── multicolcnn.md │ │ ├── switch_transformers │ │ │ ├── _page_11_Figure_4.jpeg │ │ │ ├── _page_12_Figure_4.jpeg │ │ │ ├── _page_13_Figure_2.jpeg │ │ │ ├── _page_18_Figure_1.jpeg │ │ │ ├── _page_18_Figure_3.jpeg │ │ │ ├── _page_2_Figure_3.jpeg │ │ │ ├── _page_20_Figure_1.jpeg │ │ │ ├── _page_20_Figure_4.jpeg │ │ │ ├── _page_27_Figure_1.jpeg │ 
│ │ ├── _page_29_Figure_1.jpeg │ │ │ ├── _page_30_Figure_1.jpeg │ │ │ ├── _page_31_Figure_3.jpeg │ │ │ ├── _page_4_Figure_1.jpeg │ │ │ ├── _page_5_Figure_3.jpeg │ │ │ ├── switch_trans_meta.json │ │ │ └── switch_trans.md │ │ └── thinkpython │ │ ├── _page_109_Figure_1.jpeg │ │ ├── _page_115_Figure_1.jpeg │ │ ├── _page_116_Figure_3.jpeg │ │ ├── _page_127_Figure_1.jpeg │ │ ├── _page_128_Figure_1.jpeg │ │ ├── _page_167_Figure_1.jpeg │ │ ├── _page_169_Figure_1.jpeg │ │ ├── _page_173_Figure_1.jpeg │ │ ├── _page_190_Figure_1.jpeg │ │ ├── _page_195_Figure_1.jpeg │ │ ├── _page_205_Figure_1.jpeg │ │ ├── _page_23_Figure_1.jpeg │ │ ├── _page_23_Figure_3.jpeg │ │ ├── _page_230_Figure_1.jpeg │ │ ├── _page_233_Figure_1.jpeg │ │ ├── _page_233_Figure_3.jpeg │ │ ├── _page_234_Figure_1.jpeg │ │ ├── _page_235_Figure_1.jpeg │ │ ├── _page_236_Figure_1.jpeg │ │ ├── _page_236_Figure_3.jpeg │ │ ├── _page_237_Figure_1.jpeg │ │ ├── _page_238_Figure_1.jpeg │ │ ├── _page_46_Figure_1.jpeg │ │ ├── _page_60_Figure_1.jpeg │ │ ├── _page_60_Figure_3.jpeg │ │ ├── _page_67_Figure_1.jpeg │ │ ├── _page_71_Figure_1.jpeg │ │ ├── _page_78_Figure_1.jpeg │ │ ├── _page_85_Figure_1.jpeg │ │ ├── _page_94_Figure_1.jpeg │ │ ├── _page_99_Figure_17.jpeg │ │ ├── _page_99_Figure_178.jpeg │ │ ├── thinkpython_meta.json │ │ └── thinkpython.md │ ├── images │ │ ├── overall.png │ │ ├── per_doc.png │ │ └── table.png │ └── latex_to_md.sh ├── examples │ ├── marker_modal_deployment.py │ └── README.md ├── extraction_app.py ├── LICENSE ├── marker │ ├── builders │ │ ├── __init__.py │ │ ├── document.py │ │ ├── layout.py │ │ ├── line.py │ │ ├── ocr.py │ │ └── structure.py │ ├── config │ │ ├── __init__.py │ │ ├── crawler.py │ │ ├── parser.py │ │ └── printer.py │ ├── converters │ │ ├── __init__.py │ │ ├── extraction.py │ │ ├── ocr.py │ │ ├── pdf.py │ │ └── table.py │ ├── extractors │ │ ├── __init__.py │ │ ├── document.py │ │ └── page.py │ ├── logger.py │ ├── models.py │ ├── output.py │ ├── processors │ │ ├── __init__.py │ │ ├── 
blank_page.py │ │ ├── block_relabel.py │ │ ├── blockquote.py │ │ ├── code.py │ │ ├── debug.py │ │ ├── document_toc.py │ │ ├── equation.py │ │ ├── footnote.py │ │ ├── ignoretext.py │ │ ├── line_merge.py │ │ ├── line_numbers.py │ │ ├── list.py │ │ ├── llm │ │ │ ├── __init__.py │ │ │ ├── llm_complex.py │ │ │ ├── llm_equation.py │ │ │ ├── llm_form.py │ │ │ ├── llm_handwriting.py │ │ │ ├── llm_image_description.py │ │ │ ├── llm_mathblock.py │ │ │ ├── llm_meta.py │ │ │ ├── llm_page_correction.py │ │ │ ├── llm_sectionheader.py │ │ │ ├── llm_table_merge.py │ │ │ └── llm_table.py │ │ ├── order.py │ │ ├── page_header.py │ │ ├── reference.py │ │ ├── sectionheader.py │ │ ├── table.py │ │ ├── text.py │ │ └── util.py │ ├── providers │ │ ├── __init__.py │ │ ├── document.py │ │ ├── epub.py │ │ ├── html.py │ │ ├── image.py │ │ ├── pdf.py │ │ ├── powerpoint.py │ │ ├── registry.py │ │ ├── spreadsheet.py │ │ └── utils.py │ ├── renderers │ │ ├── __init__.py │ │ ├── chunk.py │ │ ├── extraction.py │ │ ├── html.py │ │ ├── json.py │ │ ├── markdown.py │ │ └── ocr_json.py │ ├── schema │ │ ├── __init__.py │ │ ├── blocks │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── basetable.py │ │ │ ├── caption.py │ │ │ ├── code.py │ │ │ ├── complexregion.py │ │ │ ├── equation.py │ │ │ ├── figure.py │ │ │ ├── footnote.py │ │ │ ├── form.py │ │ │ ├── handwriting.py │ │ │ ├── inlinemath.py │ │ │ ├── listitem.py │ │ │ ├── pagefooter.py │ │ │ ├── pageheader.py │ │ │ ├── picture.py │ │ │ ├── reference.py │ │ │ ├── sectionheader.py │ │ │ ├── table.py │ │ │ ├── tablecell.py │ │ │ ├── text.py │ │ │ └── toc.py │ │ ├── document.py │ │ ├── groups │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── figure.py │ │ │ ├── list.py │ │ │ ├── page.py │ │ │ ├── picture.py │ │ │ └── table.py │ │ ├── polygon.py │ │ ├── registry.py │ │ └── text │ │ ├── __init__.py │ │ ├── char.py │ │ ├── line.py │ │ └── span.py │ ├── scripts │ │ ├── __init__.py │ │ ├── chunk_convert.py │ │ ├── chunk_convert.sh │ │ ├── common.py │ │ ├── 
convert_single.py │ │ ├── convert.py │ │ ├── extraction_app.py │ │ ├── file_to_s3.py │ │ ├── run_streamlit_app.py │ │ ├── server.py │ │ └── streamlit_app.py │ ├── services │ │ ├── __init__.py │ │ ├── azure_openai.py │ │ ├── claude.py │ │ ├── gemini.py │ │ ├── ollama.py │ │ ├── openai.py │ │ └── vertex.py │ ├── settings.py │ ├── util.py │ └── utils │ ├── __init__.py │ ├── batch.py │ ├── gpu.py │ └── image.py ├── marker_app.py ├── marker_server.py ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── README.md ├── signatures │ └── version1 │ └── cla.json ├── static │ └── fonts │ └── .gitignore └── tests ├── builders │ ├── test_blank_page.py │ ├── test_document_builder.py │ ├── test_garbled_pdf.py │ ├── test_layout_replace.py │ ├── test_ocr_builder.py │ ├── test_ocr_pipeline.py │ ├── test_overriding.py │ ├── test_pdf_links.py │ ├── test_rotated_bboxes.py │ ├── test_strip_existing_ocr.py │ └── test_structure.py ├── config │ └── test_config.py ├── conftest.py ├── converters │ ├── test_extraction_converter.py │ ├── test_ocr_converter.py │ ├── test_pdf_converter.py │ └── test_table_converter.py ├── processors │ ├── test_document_toc_processor.py │ ├── test_equation_processor.py │ ├── test_footnote_processor.py │ ├── test_ignoretext.py │ ├── test_llm_processors.py │ ├── test_table_merge.py │ └── test_table_processor.py ├── providers │ ├── test_document_providers.py │ ├── test_image_provider.py │ └── test_pdf_provider.py ├── renderers │ ├── test_chunk_renderer.py │ ├── test_extract_images.py │ ├── test_html_renderer.py │ ├── test_json_renderer.py │ └── test_markdown_renderer.py ├── schema │ └── groups │ └── test_list_grouping.py ├── services │ └── test_service_init.py └── utils.py ``` # Files -------------------------------------------------------------------------------- /static/fonts/.gitignore: -------------------------------------------------------------------------------- ``` 1 | * 2 | !.gitignore ``` 
-------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- ``` 1 | latex 2 | pdfs 3 | references ``` -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- ```yaml 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.9.10 5 | hooks: 6 | # Run the linter. 7 | - id: ruff 8 | types_or: [ python, pyi ] 9 | args: [ --fix ] 10 | # Run the formatter. 11 | - id: ruff-format 12 | types_or: [ python, pyi ] ``` -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- ``` 1 | private.py 2 | .DS_Store 3 | local.env 4 | experiments 5 | test_data 6 | training 7 | wandb 8 | *.dat 9 | report.json 10 | benchmark_data 11 | debug_data 12 | temp.md 13 | temp 14 | conversion_results 15 | uploads 16 | /cache 17 | 18 | # Byte-compiled / optimized / DLL files 19 | __pycache__/ 20 | *.py[cod] 21 | *$py.class 22 | 23 | # C extensions 24 | *.so 25 | 26 | # Distribution / packaging 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | cover/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | .pybuilder/ 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 112 | #Pipfile.lock 113 | 114 | # poetry 115 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 116 | # This is especially recommended for binary packages to ensure reproducibility, and is more 117 | # commonly ignored for libraries. 118 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 119 | #poetry.lock 120 | 121 | # pdm 122 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
123 | #pdm.lock 124 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 125 | # in version control. 126 | # https://pdm.fming.dev/#use-with-ide 127 | .pdm.toml 128 | 129 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 130 | __pypackages__/ 131 | 132 | # Celery stuff 133 | celerybeat-schedule 134 | celerybeat.pid 135 | 136 | # SageMath parsed files 137 | *.sage.py 138 | 139 | # Environments 140 | .env 141 | .venv 142 | env/ 143 | venv/ 144 | ENV/ 145 | env.bak/ 146 | venv.bak/ 147 | 148 | # Spyder project settings 149 | .spyderproject 150 | .spyproject 151 | 152 | # Rope project settings 153 | .ropeproject 154 | 155 | # mkdocs documentation 156 | /site 157 | 158 | # mypy 159 | .mypy_cache/ 160 | .dmypy.json 161 | dmypy.json 162 | 163 | # Pyre type checker 164 | .pyre/ 165 | 166 | # pytype static type analyzer 167 | .pytype/ 168 | 169 | # Cython debug symbols 170 | cython_debug/ 171 | 172 | # PyCharm 173 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 174 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 175 | # and can be added to the global gitignore or merged into this file. For a more nuclear 176 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 177 | .idea/ 178 | 179 | .vscode/ ``` -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- ```markdown 1 | ## Usage Examples 2 | 3 | This directory contains examples of running `marker` in different contexts. 
4 | 5 | ### Usage with Modal 6 | 7 | We have a [self-contained example](./marker_modal_deployment.py) that shows how you can quickly use [Modal](https://modal.com) to deploy `marker` by provisioning a container with a GPU, and expose that with an API so you can submit PDFs for conversion into Markdown, HTML, or JSON. 8 | 9 | It's a limited example that you can extend into different use cases. 10 | 11 | #### Pre-requisites 12 | 13 | Make sure you have the `modal` client installed by [following their instructions here](https://modal.com/docs/guide#getting-started). 14 | 15 | Modal's [Starter Plan](https://modal.com/pricing) includes $30 of free compute each month. 16 | Modal is [serverless](https://arxiv.org/abs/1902.03383), so you only pay for resources when you are using them. 17 | 18 | #### Running the example 19 | 20 | Once `modal` is configured, you can deploy it to your workspace by running: 21 | 22 | > modal deploy marker_modal_deployment.py 23 | 24 | Notes: 25 | - `marker` has a few models it uses. By default, the endpoint will check if these models are loaded and download them if not (first request will be slow). You can avoid this by running 26 | 27 | > modal run marker_modal_deployment.py::download_models 28 | 29 | Which will create a [`Modal Volume`](https://modal.com/docs/guide/Volumes) to store them for re-use. 30 | 31 | Once the deploy is finished, you can: 32 | - Test a file upload locally through your CLI using an `invoke_conversion` command we expose through Modal's [`local_entrypoint`](https://modal.com/docs/reference/modal.App#local_entrypoint) 33 | - Get the URL of your endpoint and make a request through a client of your choice. 
34 | 35 | **Test from your CLI with `invoke_conversion`** 36 | 37 | If your endpoint is live, simply run this command: 38 | 39 | ``` 40 | $ modal run marker_modal_deployment.py::invoke_conversion --pdf-file <PDF_FILE_PATH> --output-format markdown 41 | ``` 42 | 43 | And it'll automatically detect the URL of your new endpoint using [`.get_web_url()`](https://modal.com/docs/guide/webhook-urls#determine-the-url-of-a-web-endpoint-from-code), make sure it's healthy, submit your file, and store its output on your machine (in the same directory). 44 | 45 | **Making a request using your own client** 46 | 47 | If you want to make requests elsewhere e.g. with cURL or a client like Insomnia, you'll need to get the URL. 48 | 49 | When your `modal deploy` command from earlier finishes, it'll include your endpoint URL at the end. For example: 50 | 51 | ``` 52 | $ modal deploy marker_modal_deployment.py 53 | ... 54 | ✓ Created objects. 55 | ├── 🔨 Created mount /marker/examples/marker_modal_deployment.py 56 | ├── 🔨 Created function download_models. 57 | ├── 🔨 Created function MarkerModalDemoService.*. 58 | └── 🔨 Created web endpoint for MarkerModalDemoService.fastapi_app => <YOUR_ENDPOINT_URL> 59 | ✓ App deployed in 149.877s! 
🎉 60 | ``` 61 | 62 | If you accidentally close your terminal session, you can also always go into Modal's dashboard and: 63 | - Find the app (default name: `datalab-marker-modal-demo`) 64 | - Click on `MarkerModalDemoService` 65 | - Find your endpoint URL 66 | 67 | Once you have your URL, make a request to `{YOUR_ENDPOINT_URL}/convert` like this (you can also use Insomnia, etc.): 68 | ``` 69 | curl --request POST \ 70 | --url {BASE_URL}/convert \ 71 | --header 'Content-Type: multipart/form-data' \ 72 | --form file=@/Users/cooldev/sample.pdf \ 73 | --form output_format=html 74 | ``` 75 | 76 | You should get a response like this 77 | 78 | ``` 79 | { 80 | "success": true, 81 | "filename": "sample.pdf", 82 | "output_format": "html", 83 | "json": null, 84 | "html": "<YOUR_RESPONSE_CONTENT>", 85 | "markdown": null, 86 | "images": {}, 87 | "metadata": {... page level metadata ...}, 88 | "page_count": 2 89 | } 90 | ``` 91 | 92 | [Modal](https://modal.com) makes deploying and scaling models and inference workloads much easier. 93 | 94 | If you're interested in Datalab's managed API or on-prem document intelligence solution, check out [our platform here](https://datalab.to/?utm_source=gh-marker). 95 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- ```markdown 1 | # Marker 2 | 3 | Marker converts documents to markdown, JSON, chunks, and HTML quickly and accurately. 
4 | 5 | - Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages 6 | - Formats tables, forms, equations, inline math, links, references, and code blocks 7 | - Extracts and saves images 8 | - Removes headers/footers/other artifacts 9 | - Extensible with your own formatting and logic 10 | - Does structured extraction, given a JSON schema (beta) 11 | - Optionally boost accuracy with LLMs (and your own prompt) 12 | - Works on GPU, CPU, or MPS 13 | 14 | For our managed API or on-prem document intelligence solution, check out [our platform here](https://datalab.to?utm_source=gh-marker). 15 | 16 | ## Performance 17 | 18 | <img src="data/images/overall.png" width="800px"/> 19 | 20 | Marker benchmarks favorably compared to cloud services like Llamaparse and Mathpix, as well as other open source tools. 21 | 22 | The above results are running single PDF pages serially. Marker is significantly faster when running in batch mode, with a projected throughput of 25 pages/second on an H100. 23 | 24 | See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks. 25 | 26 | ## Hybrid Mode 27 | 28 | For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker. This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms. It can use any gemini or ollama model. By default, it uses `gemini-2.0-flash`. See [below](#llm-services) for details. 29 | 30 | Here is a table benchmark comparing marker, gemini flash alone, and marker with use_llm: 31 | 32 | <img src="data/images/table.png" width="400px"/> 33 | 34 | As you can see, the use_llm mode offers higher accuracy than marker or gemini alone. 
35 | 36 | ## Examples 37 | 38 | | PDF | File type | Markdown | JSON | 39 | |-----|-----------|------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------| 40 | | [Think Python](https://greenteapress.com/thinkpython/thinkpython.pdf) | Textbook | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/thinkpython/thinkpython.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/thinkpython.json) | 41 | | [Switch Transformers](https://arxiv.org/pdf/2101.03961.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/switch_transformers/switch_trans.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/switch_trans.json) | 42 | | [Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/multicolcnn/multicolcnn.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/multicolcnn.json) | 43 | 44 | # Commercial usage 45 | 46 | Our model weights use a modified AI Pubs Open Rail-M license (free for research, personal use, and startups under $2M funding/revenue) and our code is GPL. For broader commercial licensing or to remove GPL requirements, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-marker). 47 | 48 | # Hosted API & On-prem 49 | 50 | There's a [hosted API](https://www.datalab.to?utm_source=gh-marker) and [painless on-prem solution](https://www.datalab.to/blog/self-serve-on-prem-licensing) for marker - it's free to sign up, and we'll throw in credits for you to test it out. 
51 | 52 | The API: 53 | - Supports PDF, image, PPT, PPTX, DOC, DOCX, XLS, XLSX, HTML, EPUB files 54 | - Is 1/4th the price of leading cloud-based competitors 55 | - Fast - ~15s for a 250 page PDF 56 | - Supports LLM mode 57 | - High uptime (99.99%) 58 | 59 | # Community 60 | 61 | [Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development. 62 | 63 | # Installation 64 | 65 | You'll need python 3.10+ and [PyTorch](https://pytorch.org/get-started/locally/). 66 | 67 | Install with: 68 | 69 | ```shell 70 | pip install marker-pdf 71 | ``` 72 | 73 | If you want to use marker on documents other than PDFs, you will need to install additional dependencies with: 74 | 75 | ```shell 76 | pip install marker-pdf[full] 77 | ``` 78 | 79 | # Usage 80 | 81 | First, some configuration: 82 | 83 | - Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`. 84 | - Some PDFs, even digital ones, have bad text in them. Set `--force_ocr` to force OCR on all lines, or `--strip_existing_ocr` to keep all digital text, and strip out any existing OCR text. 85 | - If you care about inline math, set `force_ocr` to convert inline math to LaTeX. 86 | 87 | ## Interactive App 88 | 89 | I've included a streamlit app that lets you interactively try marker with some basic options. Run it with: 90 | 91 | ```shell 92 | pip install streamlit streamlit-ace 93 | marker_gui 94 | ``` 95 | 96 | ## Convert a single file 97 | 98 | ```shell 99 | marker_single /path/to/file.pdf 100 | ``` 101 | 102 | You can pass in PDFs or images. 103 | 104 | Options: 105 | - `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20. 106 | - `--output_format [markdown|json|html|chunks]`: Specify the format for the output results. 107 | - `--output_dir PATH`: Directory where output files will be saved.
Defaults to the value specified in settings.OUTPUT_DIR. 108 | - `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n` 109 | - `--use_llm`: Uses an LLM to improve accuracy. You will need to configure the LLM backend - see [below](#llm-services). 110 | - `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text. This will also format inline math properly. 111 | - `--block_correction_prompt`: if LLM mode is active, an optional prompt that will be used to correct the output of marker. This is useful for custom formatting or logic that you want to apply to the output. 112 | - `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya. 113 | - `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`. 114 | - `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description. 115 | - `--debug`: Enable debug mode for additional logging and diagnostic information. 116 | - `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"` 117 | - `--config_json PATH`: Path to a JSON configuration file containing additional settings. 118 | - `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults. 119 | - `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`. The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables. 120 | - `--llm_service`: Which llm service to use if `--use_llm` is passed. 
This defaults to `marker.services.gemini.GoogleGeminiService`. 121 | - `--help`: see all of the flags that can be passed into marker. (it supports many more options than are listed above) 122 | 123 | The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py). If you don't need OCR, marker can work with any language. 124 | 125 | ## Convert multiple files 126 | 127 | ```shell 128 | marker /path/to/input/folder 129 | ``` 130 | 131 | - `marker` supports all the same options from `marker_single` above. 132 | - `--workers` is the number of conversion workers to run simultaneously. This is automatically set by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average. 133 | 134 | ## Convert multiple files on multiple GPUs 135 | 136 | ```shell 137 | NUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert ../pdf_in ../md_out 138 | ``` 139 | 140 | - `NUM_DEVICES` is the number of GPUs to use. Should be `2` or greater. 141 | - `NUM_WORKERS` is the number of parallel processes to run on each GPU. 142 | 143 | ## Use from python 144 | 145 | See the `PdfConverter` class at `marker/converters/pdf.py` for additional arguments that can be passed. 146 | 147 | ```python 148 | from marker.converters.pdf import PdfConverter 149 | from marker.models import create_model_dict 150 | from marker.output import text_from_rendered 151 | 152 | converter = PdfConverter( 153 | artifact_dict=create_model_dict(), 154 | ) 155 | rendered = converter("FILEPATH") 156 | text, _, images = text_from_rendered(rendered) 157 | ``` 158 | 159 | `rendered` will be a pydantic basemodel with different properties depending on the output type requested. With markdown output (default), you'll have the properties `markdown`, `metadata`, and `images`. For json output, you'll have `children`, `block_type`, and `metadata`.
160 | 161 | ### Custom configuration 162 | 163 | You can pass configuration using the `ConfigParser`. To see all available options, do `marker_single --help`. 164 | 165 | ```python 166 | from marker.converters.pdf import PdfConverter 167 | from marker.models import create_model_dict 168 | from marker.config.parser import ConfigParser 169 | 170 | config = { 171 | "output_format": "json", 172 | "ADDITIONAL_KEY": "VALUE" 173 | } 174 | config_parser = ConfigParser(config) 175 | 176 | converter = PdfConverter( 177 | config=config_parser.generate_config_dict(), 178 | artifact_dict=create_model_dict(), 179 | processor_list=config_parser.get_processors(), 180 | renderer=config_parser.get_renderer(), 181 | llm_service=config_parser.get_llm_service() 182 | ) 183 | rendered = converter("FILEPATH") 184 | ``` 185 | 186 | ### Extract blocks 187 | 188 | Each document consists of one or more pages. Pages contain blocks, which can themselves contain other blocks. It's possible to programmatically manipulate these blocks. 189 | 190 | Here's an example of extracting all forms from a document: 191 | 192 | ```python 193 | from marker.converters.pdf import PdfConverter 194 | from marker.models import create_model_dict 195 | from marker.schema import BlockTypes 196 | 197 | converter = PdfConverter( 198 | artifact_dict=create_model_dict(), 199 | ) 200 | document = converter.build_document("FILEPATH") 201 | forms = document.contained_blocks((BlockTypes.Form,)) 202 | ``` 203 | 204 | Look at the processors for more examples of extracting and manipulating blocks. 
205 | 206 | ## Other converters 207 | 208 | You can also use other converters that define different conversion pipelines: 209 | 210 | ### Extract tables 211 | 212 | The `TableConverter` will only convert and extract tables: 213 | 214 | ```python 215 | from marker.converters.table import TableConverter 216 | from marker.models import create_model_dict 217 | from marker.output import text_from_rendered 218 | 219 | converter = TableConverter( 220 | artifact_dict=create_model_dict(), 221 | ) 222 | rendered = converter("FILEPATH") 223 | text, _, images = text_from_rendered(rendered) 224 | ``` 225 | 226 | This takes all the same configuration as the PdfConverter. You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table. Set `output_format=json` to also get cell bounding boxes. 227 | 228 | You can also run this via the CLI with 229 | ```shell 230 | marker_single FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter --output_format json 231 | ``` 232 | 233 | ### OCR Only 234 | 235 | If you only want to run OCR, you can also do that through the `OCRConverter`. Set `--keep_chars` to keep individual characters and bounding boxes. 236 | 237 | ```python 238 | from marker.converters.ocr import OCRConverter 239 | from marker.models import create_model_dict 240 | 241 | converter = OCRConverter( 242 | artifact_dict=create_model_dict(), 243 | ) 244 | rendered = converter("FILEPATH") 245 | ``` 246 | 247 | This takes all the same configuration as the PdfConverter. 248 | 249 | You can also run this via the CLI with 250 | ```shell 251 | marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter 252 | ``` 253 | 254 | ### Structured Extraction (beta) 255 | 256 | You can run structured extraction via the `ExtractionConverter`. This requires an llm service to be setup first (see [here](#llm-services) for details). 
You'll get a JSON output with the extracted values. 257 | 258 | ```python 259 | from marker.converters.extraction import ExtractionConverter 260 | from marker.models import create_model_dict 261 | from marker.config.parser import ConfigParser 262 | from pydantic import BaseModel 263 | 264 | class Links(BaseModel): 265 | links: list[str] 266 | 267 | schema = Links.model_json_schema() 268 | config_parser = ConfigParser({ 269 | "page_schema": schema 270 | }) 271 | 272 | converter = ExtractionConverter( 273 | artifact_dict=create_model_dict(), 274 | config=config_parser.generate_config_dict(), 275 | llm_service=config_parser.get_llm_service(), 276 | ) 277 | rendered = converter("FILEPATH") 278 | ``` 279 | 280 | Rendered will have an `original_markdown` field. If you pass this back in next time you run the converter, as the `existing_markdown` config key, you can skip re-parsing the document. 281 | 282 | # Output Formats 283 | 284 | ## Markdown 285 | 286 | Markdown output will include: 287 | 288 | - image links (images will be saved in the same folder) 289 | - formatted tables 290 | - embedded LaTeX equations (fenced with `$$`) 291 | - Code is fenced with triple backticks 292 | - Superscripts for footnotes 293 | 294 | ## HTML 295 | 296 | HTML output is similar to markdown output: 297 | 298 | - Images are included via `img` tags 299 | - equations are fenced with `<math>` tags 300 | - code is in `pre` tags 301 | 302 | ## JSON 303 | 304 | JSON output will be organized in a tree-like structure, with the leaf nodes being blocks. Examples of leaf nodes are a single list item, a paragraph of text, or an image. 305 | 306 | The output will be a list, with each list item representing a page. Each page is considered a block in the internal marker schema. There are different types of blocks to represent different elements. 307 | 308 | Pages have the keys: 309 | 310 | - `id` - unique id for the block. 311 | - `block_type` - the type of block. 
The possible block types can be seen in `marker/schema/__init__.py`. As of this writing, they are ["Line", "Span", "FigureGroup", "TableGroup", "ListGroup", "PictureGroup", "Page", "Caption", "Code", "Figure", "Footnote", "Form", "Equation", "Handwriting", "TextInlineMath", "ListItem", "PageFooter", "PageHeader", "Picture", "SectionHeader", "Table", "Text", "TableOfContents", "Document"] 312 | - `html` - the HTML for the page. Note that this will have recursive references to children. The `content-ref` tags must be replaced with the child content if you want the full html. You can see an example of this at `marker/output.py:json_to_html`. That function will take in a single block from the json output, and turn it into HTML. 313 | - `polygon` - the 4-corner polygon of the page, in (x1,y1), (x2,y2), (x3, y3), (x4, y4) format. (x1,y1) is the top left, and coordinates go clockwise. 314 | - `children` - the child blocks. 315 | 316 | The child blocks have two additional keys: 317 | 318 | - `section_hierarchy` - indicates the sections that the block is part of. `1` indicates an h1 tag, `2` an h2, and so on. 319 | - `images` - base64 encoded images. The key will be the block id, and the data will be the encoded image. 320 | 321 | Note that child blocks of pages can have their own children as well (a tree structure). 
322 | 323 | ```json 324 | { 325 | "id": "/page/10/Page/366", 326 | "block_type": "Page", 327 | "html": "<content-ref src='/page/10/SectionHeader/0'></content-ref><content-ref src='/page/10/SectionHeader/1'></content-ref><content-ref src='/page/10/Text/2'></content-ref><content-ref src='/page/10/Text/3'></content-ref><content-ref src='/page/10/Figure/4'></content-ref><content-ref src='/page/10/SectionHeader/5'></content-ref><content-ref src='/page/10/SectionHeader/6'></content-ref><content-ref src='/page/10/TextInlineMath/7'></content-ref><content-ref src='/page/10/TextInlineMath/8'></content-ref><content-ref src='/page/10/Table/9'></content-ref><content-ref src='/page/10/SectionHeader/10'></content-ref><content-ref src='/page/10/Text/11'></content-ref>", 328 | "polygon": [[0.0, 0.0], [612.0, 0.0], [612.0, 792.0], [0.0, 792.0]], 329 | "children": [ 330 | { 331 | "id": "/page/10/SectionHeader/0", 332 | "block_type": "SectionHeader", 333 | "html": "<h1>Supplementary Material for <i>Subspace Adversarial Training</i> </h1>", 334 | "polygon": [ 335 | [217.845703125, 80.630859375], [374.73046875, 80.630859375], 336 | [374.73046875, 107.0], 337 | [217.845703125, 107.0] 338 | ], 339 | "children": null, 340 | "section_hierarchy": { 341 | "1": "/page/10/SectionHeader/1" 342 | }, 343 | "images": {} 344 | }, 345 | ... 346 | ] 347 | } 348 | 349 | 350 | ``` 351 | 352 | ## Chunks 353 | 354 | Chunks format is similar to JSON, but flattens everything into a single list instead of a tree. Only the top level blocks from each page show up. It also has the full HTML of each block inside, so you don't need to crawl the tree to reconstruct it. This enables flexible and easy chunking for RAG. 355 | 356 | ## Metadata 357 | 358 | All output formats will return a metadata dictionary, with the following fields: 359 | 360 | ```json 361 | { 362 | "table_of_contents": [ 363 | { 364 | "title": "Introduction", 365 | "heading_level": 1, 366 | "page_id": 0, 367 | "polygon": [...] 
368 | } 369 | ], // computed PDF table of contents 370 | "page_stats": [ 371 | { 372 | "page_id": 0, 373 | "text_extraction_method": "pdftext", 374 | "block_counts": [("Span", 200), ...] 375 | }, 376 | ... 377 | ] 378 | } 379 | ``` 380 | 381 | # LLM Services 382 | 383 | When running with the `--use_llm` flag, you have a choice of services you can use: 384 | 385 | - `Gemini` - this will use the Gemini developer API by default. You'll need to pass `--gemini_api_key` to configuration. 386 | - `Google Vertex` - this will use vertex, which can be more reliable. You'll need to pass `--vertex_project_id`. To use it, set `--llm_service=marker.services.vertex.GoogleVertexService`. 387 | - `Ollama` - this will use local models. You can configure `--ollama_base_url` and `--ollama_model`. To use it, set `--llm_service=marker.services.ollama.OllamaService`. 388 | - `Claude` - this will use the anthropic API. You can configure `--claude_api_key`, and `--claude_model_name`. To use it, set `--llm_service=marker.services.claude.ClaudeService`. 389 | - `OpenAI` - this supports any openai-like endpoint. You can configure `--openai_api_key`, `--openai_model`, and `--openai_base_url`. To use it, set `--llm_service=marker.services.openai.OpenAIService`. 390 | - `Azure OpenAI` - this uses the Azure OpenAI service. You can configure `--azure_endpoint`, `--azure_api_key`, and `--deployment_name`. To use it, set `--llm_service=marker.services.azure_openai.AzureOpenAIService`. 391 | 392 | These services may have additional optional configuration as well - you can see it by viewing the classes. 393 | 394 | # Internals 395 | 396 | Marker is easy to extend. The core units of marker are: 397 | 398 | - `Providers`, at `marker/providers`. These provide information from a source file, like a PDF. 399 | - `Builders`, at `marker/builders`. These generate the initial document blocks and fill in text, using info from the providers. 400 | - `Processors`, at `marker/processors`. 
These process specific blocks, for example the table formatter is a processor. 401 | - `Renderers`, at `marker/renderers`. These use the blocks to render output. 402 | - `Schema`, at `marker/schema`. The classes for all the block types. 403 | - `Converters`, at `marker/converters`. They run the whole end to end pipeline. 404 | 405 | To customize processing behavior, override the `processors`. To add new output formats, write a new `renderer`. For additional input formats, write a new `provider`. 406 | 407 | Processors and renderers can be directly passed into the base `PdfConverter`, so you can specify your own custom processing easily. 408 | 409 | ## API server 410 | 411 | There is a very simple API server you can run like this: 412 | 413 | ```shell 414 | pip install -U uvicorn fastapi python-multipart 415 | marker_server --port 8001 416 | ``` 417 | 418 | This will start a fastapi server that you can access at `localhost:8001`. You can go to `localhost:8001/docs` to see the endpoint options. 419 | 420 | You can send requests like this: 421 | 422 | ``` 423 | import requests 424 | import json 425 | 426 | post_data = { 427 | 'filepath': 'FILEPATH', 428 | # Add other params here 429 | } 430 | 431 | requests.post("http://localhost:8001/marker", data=json.dumps(post_data)).json() 432 | ``` 433 | 434 | Note that this is not a very robust API, and is only intended for small-scale use. If you want to use this server, but want a more robust conversion option, you can use the hosted [Datalab API](https://www.datalab.to/plans). 435 | 436 | # Troubleshooting 437 | 438 | There are some settings that you may find useful if things aren't working the way you expect: 439 | 440 | - If you have issues with accuracy, try setting `--use_llm` to use an LLM to improve quality. You must set `GOOGLE_API_KEY` to a Gemini API key for this to work. 441 | - Make sure to set `force_ocr` if you see garbled text - this will re-OCR the document. 
442 | - `TORCH_DEVICE` - set this to force marker to use a given torch device for inference. 443 | - If you're getting out of memory errors, decrease worker count. You can also try splitting up long PDFs into multiple files. 444 | 445 | ## Debugging 446 | 447 | Pass the `debug` option to activate debug mode. This will save images of each page with detected layout and text, as well as output a json file with additional bounding box information. 448 | 449 | # Benchmarks 450 | 451 | ## Overall PDF Conversion 452 | 453 | We created a [benchmark set](https://huggingface.co/datasets/datalab-to/marker_benchmark) by extracting single PDF pages from common crawl. We scored based on a heuristic that aligns text with ground truth text segments, and an LLM as a judge scoring method. 454 | 455 | | Method | Avg Time | Heuristic Score | LLM Score | 456 | |------------|----------|-----------------|-----------| 457 | | marker | 2.83837 | 95.6709 | 4.23916 | 458 | | llamaparse | 23.348 | 84.2442 | 3.97619 | 459 | | mathpix | 6.36223 | 86.4281 | 4.15626 | 460 | | docling | 3.69949 | 86.7073 | 3.70429 | 461 | 462 | Benchmarks were run on an H100 for marker and docling - llamaparse and mathpix used their cloud services. 
We can also look at it by document type: 463 | 464 | <img src="data/images/per_doc.png" width="1000px"/> 465 | 466 | | Document Type | Marker heuristic | Marker LLM | Llamaparse Heuristic | Llamaparse LLM | Mathpix Heuristic | Mathpix LLM | Docling Heuristic | Docling LLM | 467 | |----------------------|------------------|------------|----------------------|----------------|-------------------|-------------|-------------------|-------------| 468 | | Scientific paper | 96.6737 | 4.34899 | 87.1651 | 3.96421 | 91.2267 | 4.46861 | 92.135 | 3.72422 | 469 | | Book page | 97.1846 | 4.16168 | 90.9532 | 4.07186 | 93.8886 | 4.35329 | 90.0556 | 3.64671 | 470 | | Other | 95.1632 | 4.25076 | 81.1385 | 4.01835 | 79.6231 | 4.00306 | 83.8223 | 3.76147 | 471 | | Form | 88.0147 | 3.84663 | 66.3081 | 3.68712 | 64.7512 | 3.33129 | 68.3857 | 3.40491 | 472 | | Presentation | 95.1562 | 4.13669 | 81.2261 | 4 | 83.6737 | 3.95683 | 84.8405 | 3.86331 | 473 | | Financial document | 95.3697 | 4.39106 | 82.5812 | 4.16111 | 81.3115 | 4.05556 | 86.3882 | 3.8 | 474 | | Letter | 98.4021 | 4.5 | 93.4477 | 4.28125 | 96.0383 | 4.45312 | 92.0952 | 4.09375 | 475 | | Engineering document | 93.9244 | 4.04412 | 77.4854 | 3.72059 | 80.3319 | 3.88235 | 79.6807 | 3.42647 | 476 | | Legal document | 96.689 | 4.27759 | 86.9769 | 3.87584 | 91.601 | 4.20805 | 87.8383 | 3.65552 | 477 | | Newspaper page | 98.8733 | 4.25806 | 84.7492 | 3.90323 | 96.9963 | 4.45161 | 92.6496 | 3.51613 | 478 | | Magazine page | 98.2145 | 4.38776 | 87.2902 | 3.97959 | 93.5934 | 4.16327 | 93.0892 | 4.02041 | 479 | 480 | ## Throughput 481 | 482 | We benchmarked throughput using a [single long PDF](https://www.greenteapress.com/thinkpython/thinkpython.pdf). 
483 | 484 | | Method | Time per page | Time per document | VRAM used | 485 | |---------|---------------|-------------------|---------- | 486 | | marker | 0.18 | 43.42 | 3.17GB | 487 | 488 | The projected throughput is 122 pages per second on an H100 - we can run 22 individual processes given the VRAM used. 489 | 490 | ## Table Conversion 491 | 492 | Marker can extract tables from PDFs using `marker.converters.table.TableConverter`. The table extraction performance is measured by comparing the extracted HTML representation of tables against the original HTML representations using the test split of [FinTabNet](https://developer.ibm.com/exchanges/data/all/fintabnet/). The HTML representations are compared using a tree edit distance based metric to judge both structure and content. Marker detects and identifies the structure of all tables in a PDF page and achieves these scores: 493 | 494 | | Method | Avg score | Total tables | 495 | |------------------|-----------|--------------| 496 | | marker | 0.816 | 99 | 497 | | marker w/use_llm | 0.907 | 99 | 498 | | gemini | 0.829 | 99 | 499 | 500 | The `--use_llm` flag can significantly improve table recognition performance, as you can see. 501 | 502 | We filter out tables that we cannot align with the ground truth, since fintabnet and our layout model have slightly different detection methods (this results in some tables being split/merged). 503 | 504 | ## Running your own benchmarks 505 | 506 | You can benchmark the performance of marker on your machine. Install marker manually with: 507 | 508 | ```shell 509 | git clone https://github.com/VikParuchuri/marker.git 510 | poetry install 511 | ``` 512 | 513 | ### Overall PDF Conversion 514 | 515 | Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. 
Then run the overall benchmark like this: 516 | 517 | ```shell 518 | python benchmarks/overall.py --methods marker --scores heuristic,llm 519 | ``` 520 | 521 | Options: 522 | 523 | - `--use_llm` use an llm to improve the marker results. 524 | - `--max_rows` how many rows to process for the benchmark. 525 | - `--methods` can be `llamaparse`, `mathpix`, `docling`, `marker`. Comma separated. 526 | - `--scores` which scoring functions to use, can be `llm`, `heuristic`. Comma separated. 527 | 528 | ### Table Conversion 529 | The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with: 530 | 531 | ```shell 532 | python benchmarks/table/table.py --max_rows 100 533 | ``` 534 | 535 | Options: 536 | 537 | - `--use_llm` uses an llm with marker to improve accuracy. 538 | - `--use_gemini` also benchmarks gemini 2.0 flash. 539 | 540 | # How it works 541 | 542 | Marker is a pipeline of deep learning models: 543 | 544 | - Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya)) 545 | - Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya)) 546 | - Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify), [surya](https://github.com/VikParuchuri/surya)) 547 | - Optionally use an LLM to improve quality 548 | - Combine blocks and postprocess complete text 549 | 550 | It only uses models where necessary, which improves speed and accuracy. 551 | 552 | # Limitations 553 | 554 | PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address: 555 | 556 | - Very complex layouts, with nested tables and forms, may not work 557 | - Forms may not be rendered well 558 | 559 | Note: Passing the `--use_llm` and `--force_ocr` flags will mostly solve these issues. 
560 | 561 | # Usage and Deployment Examples 562 | 563 | You can always run `marker` locally, but if you wanted to expose it as an API, we have a few options: 564 | - Our platform API which is powered by `marker` and `surya` and is easy to test out - it's free to sign up, and we'll include credits, [try it out here](https://datalab.to) 565 | - Our painless on-prem solution for commercial use, which you can [read about here](https://www.datalab.to/blog/self-serve-on-prem-licensing) and gives you privacy guarantees with high throughput inference optimizations. 566 | - [Deployment example with Modal](./examples/README_MODAL.md) that shows you how to deploy and access `marker` through a web endpoint using [`Modal`](https://modal.com). Modal is an AI compute platform that enables developers to deploy and scale models on GPUs in minutes. 567 | ``` -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/display/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/download/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /benchmarks/table/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` 
-------------------------------------------------------------------------------- /benchmarks/throughput/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /marker/config/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /marker/scripts/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /marker/utils/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /marker/schema/groups/base.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema.blocks import Block 2 | 3 | 4 | class Group(Block): 5 | pass ``` -------------------------------------------------------------------------------- /marker/schema/text/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema.text.line import Line 2 | from marker.schema.text.span import Span 3 | ``` -------------------------------------------------------------------------------- /marker_server.py: -------------------------------------------------------------------------------- ```python 1 | from marker.scripts.server import server_cli 2 | 3 | if __name__ == "__main__": 4 | server_cli() 5 | ``` -------------------------------------------------------------------------------- /convert.py: -------------------------------------------------------------------------------- ```python 1 | from marker.scripts.convert import convert_cli 2 | 3 | if __name__ == 
"__main__": 4 | convert_cli() 5 | ``` -------------------------------------------------------------------------------- /chunk_convert.py: -------------------------------------------------------------------------------- ```python 1 | from marker.scripts.chunk_convert import chunk_convert_cli 2 | 3 | if __name__ == "__main__": 4 | chunk_convert_cli() ``` -------------------------------------------------------------------------------- /convert_single.py: -------------------------------------------------------------------------------- ```python 1 | from marker.scripts.convert_single import convert_single_cli 2 | 3 | if __name__ == "__main__": 4 | convert_single_cli() 5 | ``` -------------------------------------------------------------------------------- /marker_app.py: -------------------------------------------------------------------------------- ```python 1 | from marker.scripts.run_streamlit_app import streamlit_app_cli 2 | 3 | if __name__ == "__main__": 4 | streamlit_app_cli() ``` -------------------------------------------------------------------------------- /extraction_app.py: -------------------------------------------------------------------------------- ```python 1 | from marker.scripts.run_streamlit_app import extraction_app_cli 2 | 3 | if __name__ == "__main__": 4 | extraction_app_cli() 5 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/schema.py: -------------------------------------------------------------------------------- ```python 1 | from typing import TypedDict, List 2 | 3 | 4 | class BenchmarkResult(TypedDict): 5 | markdown: str | List[str] 6 | time: float | None ``` -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- ``` 1 | [pytest] 2 | testpaths=tests 3 | markers = 4 | filename(name): specify the filename for the pdf_document fixture 5 | filterwarnings = 6 | 
ignore::Warning ``` -------------------------------------------------------------------------------- /benchmarks/overall/scorers/schema.py: -------------------------------------------------------------------------------- ```python 1 | from typing import TypedDict, List, Optional, Dict 2 | 3 | 4 | class BlockScores(TypedDict): 5 | score: float 6 | specific_scores: Dict[str, float | List[float]] 7 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/toc.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks.basetable import BaseTable 3 | 4 | 5 | class TableOfContents(BaseTable): 6 | block_type: str = BlockTypes.TableOfContents 7 | block_description: str = "A table of contents." 8 | ``` -------------------------------------------------------------------------------- /marker/schema/text/char.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Char(Block): 6 | block_type: BlockTypes = BlockTypes.Char 7 | block_description: str = "A single character inside a span." 
8 | 9 | text: str 10 | idx: int 11 | ``` -------------------------------------------------------------------------------- /marker/providers/utils.py: -------------------------------------------------------------------------------- ```python 1 | def alphanum_ratio(text): 2 | text = text.replace(" ", "") 3 | text = text.replace("\n", "") 4 | alphanumeric_count = sum([1 for c in text if c.isalnum()]) 5 | 6 | if len(text) == 0: 7 | return 1 8 | 9 | ratio = alphanumeric_count / len(text) 10 | return ratio 11 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/table.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks.basetable import BaseTable 3 | 4 | 5 | class Table(BaseTable): 6 | block_type: BlockTypes = BlockTypes.Table 7 | block_description: str = "A table of data, like a results table. It will be in a tabular format." 8 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/scorers/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | from benchmarks.overall.scorers.schema import BlockScores 4 | 5 | 6 | class BaseScorer: 7 | def __init__(self): 8 | pass 9 | 10 | def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores: 11 | raise NotImplementedError() ``` -------------------------------------------------------------------------------- /tests/builders/test_structure.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.builders.structure import StructureBuilder 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | def test_structure_builder(pdf_document): 8 | structure = StructureBuilder() 9 | structure(pdf_document) 10 | assert 
len(pdf_document.pages[0].structure) > 0 11 | ``` -------------------------------------------------------------------------------- /marker/schema/groups/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema.blocks.base import Block 2 | from marker.schema.groups.figure import FigureGroup 3 | from marker.schema.groups.table import TableGroup 4 | from marker.schema.groups.list import ListGroup 5 | from marker.schema.groups.picture import PictureGroup 6 | from marker.schema.groups.page import PageGroup 7 | ``` -------------------------------------------------------------------------------- /marker/builders/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Optional 2 | 3 | from pydantic import BaseModel 4 | 5 | from marker.util import assign_config 6 | 7 | 8 | class BaseBuilder: 9 | def __init__(self, config: Optional[BaseModel | dict] = None): 10 | assign_config(self, config) 11 | 12 | def __call__(self, data, *args, **kwargs): 13 | raise NotImplementedError 14 | ``` -------------------------------------------------------------------------------- /tests/builders/test_ocr_builder.py: -------------------------------------------------------------------------------- ```python 1 | from PIL import Image 2 | 3 | from marker.builders.ocr import OcrBuilder 4 | 5 | 6 | def test_blank_char_builder(recognition_model): 7 | builder = OcrBuilder(recognition_model) 8 | image = Image.new("RGB", (100, 100)) 9 | spans = builder.spans_from_html_chars([], None, image) # Test with empty char list 10 | assert len(spans) == 0 11 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/form.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.blocks.basetable 
import BaseTable 5 | 6 | 7 | class Form(BaseTable): 8 | block_type: BlockTypes = BlockTypes.Form 9 | block_description: str = "A form, such as a tax form, that contains fields and labels. It most likely doesn't have a table structure." 10 | ``` -------------------------------------------------------------------------------- /tests/renderers/test_json_renderer.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.renderers.json import JSONRenderer 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | def test_markdown_renderer_pagination(pdf_document): 8 | renderer = JSONRenderer() 9 | pages = renderer(pdf_document).children 10 | 11 | assert len(pages) == 1 12 | assert pages[0].block_type == "Page" 13 | assert pages[0].children[0].block_type == "SectionHeader" ``` -------------------------------------------------------------------------------- /benchmarks/overall/schema.py: -------------------------------------------------------------------------------- ```python 1 | from typing import TypedDict, List, Dict 2 | 3 | from benchmarks.overall.scorers.schema import BlockScores 4 | 5 | AVG_TYPE = Dict[str, Dict[str, Dict[str, List[float]]]] 6 | 7 | class FullResult(TypedDict): 8 | scores: Dict[int, Dict[str, Dict[str, BlockScores]]] 9 | averages_by_type: AVG_TYPE 10 | averages_by_block_type: AVG_TYPE 11 | average_times: Dict[str, List[float]] 12 | markdown: Dict[int, Dict[str, str]] 13 | ``` -------------------------------------------------------------------------------- /tests/builders/test_strip_existing_ocr.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | 4 | @pytest.mark.config({"page_range": [0], "strip_existing_ocr": True}) 5 | @pytest.mark.filename("handwritten.pdf") 6 | def test_strip_ocr(doc_provider): 7 | # Ensure that the OCR text isn't extracted 8 | assert len(doc_provider.page_lines) == 0 9 | 10 | 11 | 
@pytest.mark.config({"page_range": [0]}) 12 | @pytest.mark.filename("handwritten.pdf") 13 | def test_keep_ocr(doc_provider): 14 | assert len(doc_provider.page_lines) == 1 15 | ``` -------------------------------------------------------------------------------- /tests/renderers/test_html_renderer.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.renderers.html import HTMLRenderer 4 | 5 | 6 | @pytest.mark.config( 7 | { 8 | "page_range": [0], 9 | "disable_ocr": True, 10 | "add_block_ids": True, 11 | "paginate_output": True, 12 | } 13 | ) 14 | def test_html_renderer_block_ids(pdf_document, config): 15 | renderer = HTMLRenderer(config) 16 | html = renderer(pdf_document).html 17 | 18 | # Verify some block IDs are present 19 | assert "/page/0/Text/1" in html 20 | ``` -------------------------------------------------------------------------------- /tests/processors/test_document_toc_processor.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.processors.document_toc import DocumentTOCProcessor 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | def test_document_toc_processor(pdf_document, detection_model, recognition_model, table_rec_model): 8 | processor = DocumentTOCProcessor() 9 | processor(pdf_document) 10 | 11 | assert len(pdf_document.table_of_contents) == 4 12 | assert pdf_document.table_of_contents[0]["title"] == "Subspace Adversarial Training" 13 | ``` -------------------------------------------------------------------------------- /tests/processors/test_equation_processor.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | from marker.processors.equation import EquationProcessor 5 | 6 | 7 | @pytest.mark.config({"page_range": [0]}) 8 | def test_equation_processor(pdf_document, 
recognition_model): 9 | processor = EquationProcessor(recognition_model) 10 | processor(pdf_document) 11 | 12 | for block in pdf_document.pages[0].children: 13 | if block.block_type == BlockTypes.Equation: 14 | assert block.html is not None ``` -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- ```python 1 | from marker.providers.pdf import PdfProvider 2 | import tempfile 3 | 4 | import datasets 5 | 6 | 7 | def setup_pdf_provider( 8 | filename='adversarial.pdf', 9 | config=None, 10 | ) -> PdfProvider: 11 | dataset = datasets.load_dataset("datalab-to/pdfs", split="train") 12 | idx = dataset['filename'].index(filename) 13 | 14 | temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf") 15 | temp_pdf.write(dataset['pdf'][idx]) 16 | temp_pdf.flush() 17 | 18 | provider = PdfProvider(temp_pdf.name, config) 19 | return provider 20 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/code.py: -------------------------------------------------------------------------------- ```python 1 | import html 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.blocks import Block 5 | 6 | 7 | class Code(Block): 8 | block_type: BlockTypes = BlockTypes.Code 9 | code: str | None = None 10 | html: str | None = None 11 | block_description: str = "A programming code block." 
12 | 13 | def assemble_html(self, document, child_blocks, parent_structure, block_config): 14 | if self.html: 15 | return self.html 16 | code = self.code or "" 17 | return f"<pre>{html.escape(code)}</pre>" 18 | ``` -------------------------------------------------------------------------------- /marker/processors/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Optional, Tuple 2 | 3 | from pydantic import BaseModel 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.document import Document 7 | from marker.util import assign_config 8 | 9 | 10 | class BaseProcessor: 11 | block_types: Tuple[BlockTypes] | None = None # What block types this processor is responsible for 12 | 13 | def __init__(self, config: Optional[BaseModel | dict] = None): 14 | assign_config(self, config) 15 | 16 | def __call__(self, document: Document, *args, **kwargs): 17 | raise NotImplementedError 18 | ``` -------------------------------------------------------------------------------- /tests/processors/test_footnote_processor.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.processors.footnote import FootnoteProcessor 4 | from marker.schema import BlockTypes 5 | 6 | 7 | @pytest.mark.filename("population_stats.pdf") 8 | @pytest.mark.config({"page_range": [4]}) 9 | def test_footnote_processor(pdf_document): 10 | processor = FootnoteProcessor() 11 | processor(pdf_document) 12 | 13 | page0_footnotes = pdf_document.pages[0].contained_blocks(pdf_document, [BlockTypes.Footnote]) 14 | assert len(page0_footnotes) >= 2 15 | 16 | assert page0_footnotes[-1].raw_text(pdf_document).strip().startswith("5") 17 | ``` -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- ```markdown 1 | 
--- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEAT]" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## ✨ Is your feature request related to a problem? 11 | 12 | A clear and concise description of what the problem is. 13 | 14 | ## 💡 Describe the Solution You'd Like 15 | 16 | A concise description of what you want to happen or how you envision it working. 17 | 18 | ## 📋 Alternatives Considered 19 | 20 | Any alternative solutions or workarounds you've tried. 21 | 22 | ## 🧩 Additional Context 23 | 24 | Any additional context, references, or related issues. 25 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/reference.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Reference(Block): 6 | block_type: BlockTypes = BlockTypes.Reference 7 | ref: str 8 | block_description: str = "A reference to this block from another block." 
9 | 10 | def assemble_html( 11 | self, document, child_blocks, parent_structure=None, block_config=None 12 | ): 13 | template = super().assemble_html( 14 | document, child_blocks, parent_structure, block_config 15 | ) 16 | return f"<span id='{self.ref}'>{template}</span>" 17 | ``` -------------------------------------------------------------------------------- /marker/utils/batch.py: -------------------------------------------------------------------------------- ```python 1 | from marker.utils.gpu import GPUManager 2 | 3 | 4 | def get_batch_sizes_worker_counts(gpu_manager: GPUManager, peak_worker_vram: int): 5 | vram = gpu_manager.get_gpu_vram() 6 | 7 | workers = max(1, vram // peak_worker_vram) 8 | if workers == 1: 9 | return {}, workers 10 | 11 | return { 12 | "layout_batch_size": 12, 13 | "detection_batch_size": 8, 14 | "table_rec_batch_size": 12, 15 | "ocr_error_batch_size": 12, 16 | "recognition_batch_size": 64, 17 | "equation_batch_size": 16, 18 | "detector_postprocessing_cpu_workers": 2, 19 | }, workers 20 | ``` -------------------------------------------------------------------------------- /tests/providers/test_pdf_provider.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | 4 | @pytest.mark.config({"page_range": [0]}) 5 | def test_pdf_provider(doc_provider): 6 | assert len(doc_provider) == 12 7 | assert doc_provider.get_images([0], 72)[0].size == (612, 792) 8 | assert doc_provider.get_images([0], 96)[0].size == (816, 1056) 9 | 10 | page_lines = doc_provider.get_page_lines(0) 11 | assert len(page_lines) == 85 12 | 13 | spans = page_lines[0].spans 14 | assert len(spans) == 2 15 | assert spans[0].text == "Subspace Adversarial Training" 16 | assert spans[0].font == "NimbusRomNo9L-Medi" 17 | assert spans[0].formats == ["plain"] 18 | ``` -------------------------------------------------------------------------------- /tests/providers/test_image_provider.py: 
-------------------------------------------------------------------------------- ```python 1 | from marker.providers.image import ImageProvider 2 | from marker.renderers.markdown import MarkdownOutput 3 | 4 | 5 | def test_image_provider(config, temp_image): 6 | provider = ImageProvider(temp_image.name, config) 7 | assert len(provider) == 1 8 | assert provider.get_images([0], 72)[0].size == (512, 512) 9 | 10 | page_lines = provider.get_page_lines(0) 11 | assert len(page_lines) == 0 12 | 13 | def test_image_provider_conversion(pdf_converter, temp_image): 14 | markdown_output: MarkdownOutput = pdf_converter(temp_image.name) 15 | assert "Hello, World!" in markdown_output.markdown 16 | 17 | 18 | ``` -------------------------------------------------------------------------------- /marker/schema/groups/picture.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.groups.base import Group 3 | 4 | 5 | class PictureGroup(Group): 6 | block_type: BlockTypes = BlockTypes.PictureGroup 7 | block_description: str = "A picture along with associated captions." 
8 | html: str | None = None 9 | 10 | def assemble_html( 11 | self, document, child_blocks, parent_structure, block_config=None 12 | ): 13 | if self.html: 14 | return self.html 15 | 16 | child_html = super().assemble_html( 17 | document, child_blocks, parent_structure, block_config 18 | ) 19 | return child_html 20 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/mathpix.py: -------------------------------------------------------------------------------- ```python 1 | import datasets 2 | 3 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 4 | 5 | 6 | class MathpixMethod(BaseMethod): 7 | mathpix_ds: datasets.Dataset = None 8 | 9 | def __call__(self, sample) -> BenchmarkResult: 10 | uuid = sample["uuid"] 11 | data = None 12 | for row in self.mathpix_ds: 13 | if str(row["uuid"]) == str(uuid): 14 | data = row 15 | break 16 | if not data: 17 | raise ValueError(f"Could not find data for uuid {uuid}") 18 | 19 | return { 20 | "markdown": data["md"], 21 | "time": data["time"] 22 | } ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/mistral.py: -------------------------------------------------------------------------------- ```python 1 | import datasets 2 | 3 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 4 | 5 | 6 | class MistralMethod(BaseMethod): 7 | mistral_ds: datasets.Dataset = None 8 | 9 | def __call__(self, sample) -> BenchmarkResult: 10 | uuid = sample["uuid"] 11 | data = None 12 | for row in self.mistral_ds: 13 | if str(row["uuid"]) == str(uuid): 14 | data = row 15 | break 16 | if not data: 17 | raise ValueError(f"Could not find data for uuid {uuid}") 18 | 19 | return { 20 | "markdown": data["md"], 21 | "time": data["time"] 22 | } ``` -------------------------------------------------------------------------------- /marker/scripts/run_streamlit_app.py: 
-------------------------------------------------------------------------------- ```python 1 | import subprocess 2 | import os 3 | import sys 4 | 5 | 6 | def streamlit_app_cli(app_name: str = "streamlit_app.py"): 7 | argv = sys.argv[1:] 8 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 9 | app_path = os.path.join(cur_dir, app_name) 10 | cmd = [ 11 | "streamlit", 12 | "run", 13 | app_path, 14 | "--server.fileWatcherType", 15 | "none", 16 | "--server.headless", 17 | "true", 18 | ] 19 | if argv: 20 | cmd += ["--"] + argv 21 | subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"}) 22 | 23 | 24 | def extraction_app_cli(): 25 | streamlit_app_cli("extraction_app.py") 26 | ``` -------------------------------------------------------------------------------- /marker/schema/groups/figure.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.groups.base import Group 3 | 4 | 5 | class FigureGroup(Group): 6 | block_type: BlockTypes = BlockTypes.FigureGroup 7 | block_description: str = "A group that contains a figure and associated captions." 
8 | html: str | None = None 9 | 10 | def assemble_html( 11 | self, document, child_blocks, parent_structure, block_config=None 12 | ): 13 | if self.html: 14 | return self.html 15 | 16 | child_html = super().assemble_html( 17 | document, child_blocks, parent_structure, block_config 18 | ) 19 | return child_html 20 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/llamaparse.py: -------------------------------------------------------------------------------- ```python 1 | import datasets 2 | 3 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 4 | 5 | 6 | class LlamaParseMethod(BaseMethod): 7 | llamaparse_ds: datasets.Dataset = None 8 | 9 | def __call__(self, sample) -> BenchmarkResult: 10 | uuid = sample["uuid"] 11 | data = None 12 | for row in self.llamaparse_ds: 13 | if str(row["uuid"]) == str(uuid): 14 | data = row 15 | break 16 | if not data: 17 | raise ValueError(f"Could not find data for uuid {uuid}") 18 | 19 | return { 20 | "markdown": data["md"], 21 | "time": data["time"] 22 | } ``` -------------------------------------------------------------------------------- /marker/schema/blocks/handwriting.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Handwriting(Block): 6 | block_type: BlockTypes = BlockTypes.Handwriting 7 | block_description: str = "A region that contains handwriting." 
8 | html: str | None = None 9 | replace_output_newlines: bool = True 10 | 11 | def assemble_html( 12 | self, document, child_blocks, parent_structure, block_config=None 13 | ): 14 | if self.html: 15 | return self.html 16 | else: 17 | return super().assemble_html( 18 | document, child_blocks, parent_structure, block_config 19 | ) 20 | ``` -------------------------------------------------------------------------------- /tests/schema/groups/test_list_grouping.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.builders.structure import StructureBuilder 4 | from marker.schema import BlockTypes 5 | 6 | 7 | @pytest.mark.config({"page_range": [4]}) 8 | @pytest.mark.skip(reason="Model breaks this up due to equations") 9 | def test_list_grouping(pdf_document): 10 | structure = StructureBuilder() 11 | structure(pdf_document) 12 | 13 | page = pdf_document.pages[0] 14 | list_groups = [] 15 | for block in page.children: 16 | if block.block_type == BlockTypes.ListGroup: 17 | list_groups.append(block) 18 | 19 | # The model breaks this up, since it has equations in it 20 | assert len(list_groups) == 3 21 | ``` -------------------------------------------------------------------------------- /tests/renderers/test_extract_images.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.renderers.markdown import MarkdownRenderer 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | @pytest.mark.filename("A17_FlightPlan.pdf") 8 | def test_disable_extract_images(pdf_document): 9 | renderer = MarkdownRenderer({"extract_images": False}) 10 | md = renderer(pdf_document).markdown 11 | 12 | # Verify markdown 13 | assert "jpeg" not in md 14 | 15 | 16 | @pytest.mark.config({"page_range": [0]}) 17 | @pytest.mark.filename("A17_FlightPlan.pdf") 18 | def test_extract_images(pdf_document): 19 | renderer = MarkdownRenderer() 20 | md = 
renderer(pdf_document).markdown 21 | 22 | # Verify markdown 23 | assert "jpeg" in md ``` -------------------------------------------------------------------------------- /marker/renderers/extraction.py: -------------------------------------------------------------------------------- ```python 1 | from pydantic import BaseModel 2 | 3 | from marker.extractors.document import DocumentExtractionSchema 4 | from marker.renderers import BaseRenderer 5 | 6 | 7 | class ExtractionOutput(BaseModel): 8 | analysis: str 9 | document_json: str 10 | original_markdown: str 11 | 12 | 13 | class ExtractionRenderer(BaseRenderer): 14 | def __call__( 15 | self, output: DocumentExtractionSchema, markdown: str 16 | ) -> ExtractionOutput: 17 | # We definitely want to do more complex stuff here soon, so leave it in 18 | return ExtractionOutput( 19 | analysis=output.analysis, 20 | document_json=output.document_json, 21 | original_markdown=markdown, 22 | ) 23 | ``` -------------------------------------------------------------------------------- /marker/scripts/chunk_convert.py: -------------------------------------------------------------------------------- ```python 1 | import argparse 2 | import os 3 | import subprocess 4 | import pkg_resources 5 | 6 | 7 | def chunk_convert_cli(): 8 | parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.") 9 | parser.add_argument("in_folder", help="Input folder with pdfs.") 10 | parser.add_argument("out_folder", help="Output folder") 11 | args = parser.parse_args() 12 | 13 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 14 | script_path = os.path.join(cur_dir, "chunk_convert.sh") 15 | 16 | # Construct the command 17 | cmd = f"{script_path} {args.in_folder} {args.out_folder}" 18 | 19 | # Execute the shell script 20 | subprocess.run(cmd, shell=True, check=True) ``` -------------------------------------------------------------------------------- /tests/processors/test_ignoretext.py: 
-------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.processors.ignoretext import IgnoreTextProcessor 4 | from marker.schema import BlockTypes 5 | 6 | 7 | @pytest.mark.filename("bio_pdf.pdf") 8 | @pytest.mark.config({"page_range": list(range(10))}) 9 | @pytest.mark.skip(reason="New layout model correctly identifies the block as a PageHeader, so nothing to be done by the IgnoreTextProcessor") 10 | def test_ignoretext_processor(pdf_document): 11 | processor = IgnoreTextProcessor() 12 | processor(pdf_document) 13 | 14 | page1_header = pdf_document.pages[1].contained_blocks(pdf_document, [BlockTypes.Text])[0] 15 | assert "bioRxiv" in page1_header.raw_text(pdf_document) 16 | 17 | assert page1_header.ignore_for_output is True 18 | ``` -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- ```yaml 1 | name: CI tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | tests: 7 | runs-on: t4_gpu 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: Install apt requirements 11 | run: | 12 | sudo apt-get update 13 | sudo apt-get install -y libpango-1.0-0 libharfbuzz0b libpangoft2-1.0-0 libgdk-pixbuf2.0-0 libcairo2 libffi-dev shared-mime-info 14 | - name: Set up Python 3.11 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: 3.11 18 | - name: Install python dependencies 19 | run: | 20 | pip install poetry 21 | poetry install --extras "full" 22 | - name: Run tests 23 | env: 24 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 25 | run: poetry run pytest 26 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/footnote.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Footnote(Block): 6 
| block_type: BlockTypes = BlockTypes.Footnote 7 | block_description: str = ( 8 | "A footnote that explains a term or concept in the document." 9 | ) 10 | replace_output_newlines: bool = True 11 | html: str | None = None 12 | 13 | def assemble_html( 14 | self, document, child_blocks, parent_structure, block_config=None 15 | ): 16 | if self.html: 17 | return super().handle_html_output( 18 | document, child_blocks, parent_structure, block_config 19 | ) 20 | 21 | return super().assemble_html( 22 | document, child_blocks, parent_structure, block_config 23 | ) 24 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/gt.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | import json 3 | 4 | from PIL import Image 5 | 6 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 7 | 8 | 9 | class GTMethod(BaseMethod): 10 | def __call__(self, sample) -> BenchmarkResult: 11 | gt_blocks = json.loads(sample["gt_blocks"]) 12 | gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0] 13 | gt_markdown = [self.convert_to_md(block) for block in gt_html] 14 | return { 15 | "markdown": gt_markdown, 16 | "time": 0 17 | } 18 | 19 | def render(self, html: List[str]) -> Image.Image: 20 | joined = "\n\n".join(html) 21 | html = f""" 22 | <html> 23 | <head></head> 24 | <body> 25 | {joined} 26 | </body> 27 | </html> 28 | """.strip() 29 | return self.html_to_image(html) ``` -------------------------------------------------------------------------------- /marker/schema/blocks/caption.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Caption(Block): 6 | block_type: BlockTypes = BlockTypes.Caption 7 | block_description: str = "A text caption that is directly above or below an image or table. 
Only used for text describing the image or table. " 8 | replace_output_newlines: bool = True 9 | html: str | None = None 10 | 11 | def assemble_html(self, document, child_blocks, parent_structure, block_config): 12 | if self.html: 13 | return super().handle_html_output( 14 | document, child_blocks, parent_structure, block_config 15 | ) 16 | 17 | return super().assemble_html( 18 | document, child_blocks, parent_structure, block_config 19 | ) 20 | ``` -------------------------------------------------------------------------------- /marker/schema/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from enum import auto, Enum 2 | 3 | 4 | class BlockTypes(str, Enum): 5 | Line = auto() 6 | Span = auto() 7 | Char = auto() 8 | FigureGroup = auto() 9 | TableGroup = auto() 10 | ListGroup = auto() 11 | PictureGroup = auto() 12 | Page = auto() 13 | Caption = auto() 14 | Code = auto() 15 | Figure = auto() 16 | Footnote = auto() 17 | Form = auto() 18 | Equation = auto() 19 | Handwriting = auto() 20 | TextInlineMath = auto() 21 | ListItem = auto() 22 | PageFooter = auto() 23 | PageHeader = auto() 24 | Picture = auto() 25 | SectionHeader = auto() 26 | Table = auto() 27 | Text = auto() 28 | TableOfContents = auto() 29 | Document = auto() 30 | ComplexRegion = auto() 31 | TableCell = auto() 32 | Reference = auto() 33 | 34 | def __str__(self): 35 | return self.name 36 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/docling.py: -------------------------------------------------------------------------------- ```python 1 | import tempfile 2 | import time 3 | 4 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 5 | 6 | 7 | class DoclingMethod(BaseMethod): 8 | model_dict: dict = None 9 | use_llm: bool = False 10 | 11 | def __call__(self, sample) -> BenchmarkResult: 12 | from docling.document_converter import DocumentConverter 13 | pdf_bytes = 
sample["pdf"] # This is a single page PDF 14 | converter = DocumentConverter() 15 | 16 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: 17 | f.write(pdf_bytes) 18 | start = time.time() 19 | result = converter.convert(f.name) 20 | total = time.time() - start 21 | 22 | return { 23 | "markdown": result.document.export_to_markdown(), 24 | "time": total 25 | } 26 | 27 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/pagefooter.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class PageFooter(Block): 6 | block_type: str = BlockTypes.PageFooter 7 | block_description: str = ( 8 | "Text that appears at the bottom of a page, like a page number." 9 | ) 10 | replace_output_newlines: bool = True 11 | ignore_for_output: bool = True 12 | html: str | None = None 13 | 14 | def assemble_html(self, document, child_blocks, parent_structure, block_config): 15 | if block_config and block_config.get("keep_pagefooter_in_output"): 16 | self.ignore_for_output = False 17 | 18 | if self.html and not self.ignore_for_output: 19 | return self.html 20 | 21 | return super().assemble_html( 22 | document, child_blocks, parent_structure, block_config 23 | ) 24 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/pageheader.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class PageHeader(Block): 6 | block_type: BlockTypes = BlockTypes.PageHeader 7 | block_description: str = ( 8 | "Text that appears at the top of a page, like a page title." 
9 | ) 10 | replace_output_newlines: bool = True 11 | ignore_for_output: bool = True 12 | html: str | None = None 13 | 14 | def assemble_html(self, document, child_blocks, parent_structure, block_config): 15 | if block_config and block_config.get("keep_pageheader_in_output"): 16 | self.ignore_for_output = False 17 | 18 | if self.html and not self.ignore_for_output: 19 | return self.html 20 | 21 | return super().assemble_html( 22 | document, child_blocks, parent_structure, block_config 23 | ) 24 | ``` -------------------------------------------------------------------------------- /marker/schema/groups/table.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.blocks import BlockOutput 5 | from marker.schema.groups.base import Group 6 | 7 | 8 | class TableGroup(Group): 9 | block_type: BlockTypes = BlockTypes.TableGroup 10 | block_description: str = "A table along with associated captions." 
11 | html: str | None = None 12 | 13 | def assemble_html( 14 | self, 15 | document, 16 | child_blocks: List[BlockOutput], 17 | parent_structure=None, 18 | block_config: dict | None = None, 19 | ): 20 | if self.html: 21 | return self.handle_html_output( 22 | document, child_blocks, parent_structure, block_config 23 | ) 24 | 25 | return super().assemble_html( 26 | document, child_blocks, parent_structure, block_config 27 | ) 28 | ``` -------------------------------------------------------------------------------- /tests/builders/test_rotated_bboxes.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | @pytest.mark.filename("adversarial_rot.pdf") 8 | def test_rotated_bboxes(pdf_document): 9 | first_page = pdf_document.pages[0] 10 | 11 | # Ensure we match all text lines up properly 12 | text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,)) 13 | text_blocks = first_page.contained_blocks( 14 | pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath) 15 | ) 16 | # assert len(text_lines) == 84 17 | 18 | # Ensure the bbox sizes match up 19 | max_line_position = max([line.polygon.x_end for line in text_lines]) 20 | max_block_position = max( 21 | [block.polygon.x_end for block in text_blocks if block.source == "layout"] 22 | ) 23 | assert max_line_position <= max_block_position 24 | ``` -------------------------------------------------------------------------------- /marker/processors/document_toc.py: -------------------------------------------------------------------------------- ```python 1 | from marker.processors import BaseProcessor 2 | from marker.schema import BlockTypes 3 | from marker.schema.document import Document 4 | 5 | 6 | class DocumentTOCProcessor(BaseProcessor): 7 | """ 8 | A processor for generating a table of contents for the document. 
9 | """ 10 | block_types = (BlockTypes.SectionHeader, ) 11 | 12 | def __call__(self, document: Document): 13 | toc = [] 14 | for page in document.pages: 15 | for block in page.contained_blocks(document, self.block_types): 16 | toc.append({ 17 | "title": block.raw_text(document).strip(), 18 | "heading_level": block.heading_level, 19 | "page_id": page.page_id, 20 | "polygon": block.polygon.polygon 21 | }) 22 | document.table_of_contents = toc 23 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/registry.py: -------------------------------------------------------------------------------- ```python 1 | from benchmarks.overall.methods.docling import DoclingMethod 2 | from benchmarks.overall.methods.gt import GTMethod 3 | from benchmarks.overall.methods.llamaparse import LlamaParseMethod 4 | from benchmarks.overall.methods.marker import MarkerMethod 5 | from benchmarks.overall.methods.mathpix import MathpixMethod 6 | from benchmarks.overall.methods.mistral import MistralMethod 7 | from benchmarks.overall.methods.olmocr import OlmOCRMethod 8 | from benchmarks.overall.scorers.heuristic import HeuristicScorer 9 | from benchmarks.overall.scorers.llm import LLMScorer 10 | 11 | SCORE_REGISTRY = { 12 | "heuristic": HeuristicScorer, 13 | "llm": LLMScorer 14 | } 15 | 16 | METHOD_REGISTRY = { 17 | "marker": MarkerMethod, 18 | "gt": GTMethod, 19 | "mathpix": MathpixMethod, 20 | "llamaparse": LlamaParseMethod, 21 | "docling": DoclingMethod, 22 | "olmocr": OlmOCRMethod, 23 | "mistral": MistralMethod 24 | } ``` -------------------------------------------------------------------------------- /marker/processors/page_header.py: -------------------------------------------------------------------------------- ```python 1 | from marker.processors import BaseProcessor 2 | from marker.schema import BlockTypes 3 | from marker.schema.document import Document 4 | from marker.schema.groups.page import PageGroup 5 | 6 | 7 | class 
PageHeaderProcessor(BaseProcessor): 8 | """ 9 | A processor for moving PageHeaders to the top 10 | """ 11 | block_types = (BlockTypes.PageHeader,) 12 | 13 | def __call__(self, document: Document): 14 | for page in document.pages: 15 | self.move_page_header_to_top(page, document) 16 | 17 | def move_page_header_to_top(self, page: PageGroup, document: Document): 18 | page_header_blocks = page.contained_blocks(document, self.block_types) 19 | page_header_block_ids = [block.id for block in page_header_blocks] 20 | for block_id in page_header_block_ids: 21 | page.structure.remove(block_id) 22 | page.structure[:0] = page_header_block_ids 23 | 24 | ``` -------------------------------------------------------------------------------- /marker/schema/groups/list.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.groups.base import Group 3 | 4 | 5 | class ListGroup(Group): 6 | block_type: BlockTypes = BlockTypes.ListGroup 7 | has_continuation: bool = False 8 | block_description: str = "A group of list items that should be rendered together." 
@click.command("Download data from inference services")
@click.argument("service", type=click.Choice(["mathpix", "llamaparse", "mistral"]))
@click.option("--max_rows", type=int, default=2200)
@click.option("--api_key", type=str, default=None)
@click.option("--app_id", type=str, default=None)
def main(service: str, max_rows: int, api_key: str, app_id: str):
    """Run one hosted-inference downloader and push its output to the hub.

    ``service`` selects the backend; ``api_key``/``app_id`` are that
    service's credentials (``app_id`` is presumably Mathpix-only — the other
    downloaders receive it too; confirm against the downloader signatures).
    """
    # Dispatch table: service name -> downloader class.
    registry = {
        "mathpix": MathpixDownloader,
        "llamaparse": LlamaParseDownloader,
        "mistral": MistralDownloader,
    }
    downloader = registry[service](api_key, app_id, max_rows=max_rows)

    # Generate data and upload to hub
    downloader()

if __name__ == "__main__":
    main()
def configure_logging():
    """Configure the shared ``marker`` logger and quiet noisy dependencies.

    Safe to call more than once: the stream handler is only attached when
    the logger has no handlers yet.
    """
    logger = get_logger()

    if not logger.handlers:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(
            logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")
        )
        logger.addHandler(stream_handler)

    logger.setLevel(settings.LOGLEVEL)

    # Ignore future warnings
    warnings.simplefilter(action="ignore", category=FutureWarning)

    # Quiet chatty third-party loggers.
    for noisy_name, noisy_level in (
        ("PIL", logging.ERROR),
        ("fontTools.subset", logging.ERROR),
        ("fontTools.ttLib.ttFont", logging.ERROR),
        ("weasyprint", logging.CRITICAL),
    ):
        logging.getLogger(noisy_name).setLevel(noisy_level)


def get_logger():
    """Return the package-wide ``marker`` logger."""
    return logging.getLogger("marker")
class SectionHeader(Block):
    """A section heading, rendered as an ``<h1>``-``<h6>`` element."""

    block_type: BlockTypes = BlockTypes.SectionHeader
    heading_level: Optional[int] = None
    block_description: str = "The header of a section of text or other blocks."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.ignore_for_output:
            return ""

        # An LLM processor may have produced HTML directly; prefer it.
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        inner = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        ).replace("\n", " ")
        # Fall back to h2 when no heading level was detected.
        level = self.heading_level if self.heading_level else 2
        return f"<h{level}>{inner}</h{level}>"
class HTMLProvider(PdfProvider):
    """Provider that renders an HTML file to a temporary PDF and then
    delegates all further handling to :class:`PdfProvider`.

    The temporary PDF is removed when the provider is garbage collected,
    or immediately if the HTML-to-PDF conversion fails.
    """

    def __init__(self, filepath: str, config=None):
        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        self.temp_pdf_path = temp_pdf.name
        temp_pdf.close()

        # Convert HTML to PDF
        try:
            self.convert_html_to_pdf(filepath)
        except Exception as e:
            # Don't leave a stale temp file behind when conversion fails.
            if os.path.exists(self.temp_pdf_path):
                os.remove(self.temp_pdf_path)
            raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}")

        # Initialize the PDF provider with the temp pdf path
        super().__init__(self.temp_pdf_path, config)

    def __del__(self):
        # Guard with getattr: __init__ may have failed before the attribute
        # was assigned, and __del__ must never raise.
        temp_pdf_path = getattr(self, "temp_pdf_path", None)
        if temp_pdf_path and os.path.exists(temp_pdf_path):
            os.remove(temp_pdf_path)

    def convert_html_to_pdf(self, filepath: str):
        """Render *filepath* into ``self.temp_pdf_path`` using WeasyPrint."""
        from weasyprint import HTML

        font_css = self.get_font_css()
        HTML(filename=filepath, encoding="utf-8").write_pdf(
            self.temp_pdf_path, stylesheets=[font_css]
        )
class Figure(Block):
    """An image block holding a chart or other data-bearing figure."""

    block_type: BlockTypes = BlockTypes.Figure
    description: str | None = None
    html: str | None = None
    block_description: str = "A chart or other image that contains data."

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        # If an LLM processor supplied HTML for this figure, emit that.
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        # Only Reference children are rendered inline for figures.
        references = [
            child
            for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        rendered = super().assemble_html(
            document, references, parent_structure, block_config
        )
        if not self.description:
            return rendered
        return (
            rendered
            + f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
        )
class Equation(Block):
    """A display (block-level) math equation."""

    block_type: BlockTypes = BlockTypes.Equation
    html: str | None = None
    block_description: str = "A block math equation."

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        if not self.html:
            # No pre-rendered math: wrap the children's own HTML.
            inner = super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
            return f"<p block-type='{self.block_type}'>{inner}</p>"

        # Pre-rendered math: render only Reference children, then append it.
        references = [
            child
            for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        out = super().assemble_html(
            document, references, parent_structure, block_config
        )
        out += f"""<p block-type='{self.block_type}'>{self.html}</p>"""
        return out
import pytest
from marker.converters.table import TableConverter
from marker.renderers.markdown import MarkdownOutput
from marker.util import classes_to_strings

def _table_converter(config, model_dict, renderer, temp_pdf):
    # Shared driver: build a TableConverter and check the rendered markdown.
    converter = TableConverter(
        artifact_dict=model_dict,
        processor_list=None,
        renderer=classes_to_strings([renderer])[0],
        config=config
    )

    markdown_output: MarkdownOutput = converter(temp_pdf.name)
    markdown = markdown_output.markdown

    # "cyclic" presumably appears in the table on page 5 of the fixture
    # document — confirm against the test PDF if this assertion changes.
    assert len(markdown) > 0
    assert "cyclic" in markdown


@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5]})
def test_table_converter(config, model_dict, renderer, temp_doc):
    # Digital-text path (no forced OCR).
    _table_converter(config, model_dict, renderer, temp_doc)

@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5], "force_ocr": True})
def test_table_converter_ocr(config, model_dict, renderer, temp_doc):
    # Same expectations with OCR forced on.
    _table_converter(config, model_dict, renderer, temp_doc)
def test_blank_page(config, doc_provider, layout_model, ocr_error_model, detection_model):
    """Builders should tolerate pages with no layout boxes and no lines."""
    layout_builder = LayoutBuilder(layout_model, config)
    line_builder = LineBuilder(detection_model, ocr_error_model)
    builder = DocumentBuilder(config)
    document = builder.build_document(doc_provider)

    # Simulate a blank document: empty layout results and no provider/OCR lines.
    layout_results = [LayoutResult(
        bboxes=[],
        image_bbox=p.polygon.bbox,
    ) for p in document.pages]
    provider_lines = {p.page_id: [] for p in document.pages}
    ocr_lines = {p.page_id: [] for p in document.pages}

    layout_builder.add_blocks_to_pages(document.pages, layout_results)
    line_builder.merge_blocks(document, provider_lines, ocr_lines)

    # Pages must still end up with well-formed (list) children/structure.
    assert all([isinstance(p.children, list) for p in document.pages])
    assert all([isinstance(p.structure, list) for p in document.pages])
class ComplexRegion(Block):
    """A mixed region that could not be classified as a single block type."""

    block_type: BlockTypes = BlockTypes.ComplexRegion
    html: str | None = None
    block_description: str = "A complex region that can consist of multiple different types of blocks mixed with images. This block is chosen when it is difficult to categorize the region as a single block type."

    def assemble_html(self, document, child_blocks, parent_structure, block_config=None):
        # `block_config` now defaults to None, matching the assemble_html
        # signature of every other block type (backward compatible).
        if self.html:
            # Pre-rendered HTML: render only Reference children, append it.
            child_ref_blocks = [
                block
                for block in child_blocks
                if block.id.block_type == BlockTypes.Reference
            ]
            html = super().assemble_html(
                document, child_ref_blocks, parent_structure, block_config
            )
            return html + self.html
        else:
            template = super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
            return f"<p>{template}</p>"
class Picture(Block):
    """An image block that represents a picture."""

    block_type: BlockTypes = BlockTypes.Picture
    description: str | None = None
    block_description: str = "An image block that represents a picture."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        # Prefer HTML produced by an LLM processor when present.
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        # Pictures only render their Reference children inline.
        references = [
            child
            for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        rendered = super().assemble_html(
            document, references, parent_structure, block_config
        )

        if not self.description:
            return rendered
        return (
            rendered
            + f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
        )
class GoogleVertexService(BaseGeminiService):
    """Gemini service backed by Google Cloud Vertex AI instead of AI Studio."""

    vertex_project_id: Annotated[
        str,
        "Google Cloud Project ID for Vertex AI.",
    ] = None
    vertex_location: Annotated[
        str,
        "Google Cloud Location for Vertex AI.",
    ] = "us-central1"
    gemini_model_name: Annotated[
        str,
        "The name of the Google model to use for the service."
    ] = "gemini-2.0-flash-001"
    vertex_dedicated: Annotated[
        bool,
        "Whether to use a dedicated Vertex AI instance."
    ] = False

    def get_google_client(self, timeout: int):
        """Build a Vertex-mode genai client; ``timeout`` is in seconds."""
        http_options = {"timeout": timeout * 1000}  # Convert to milliseconds
        if self.vertex_dedicated:
            # Header marks the request for a dedicated Vertex instance.
            http_options["headers"] = {"x-vertex-ai-llm-request-type": "dedicated"}
        return genai.Client(
            vertexai=True,
            project=self.vertex_project_id,
            location=self.vertex_location,
            http_options=http_options,
        )
19 | exit 1 20 | fi 21 | 22 | if [[ -z "$2" ]]; then 23 | echo "Please provide an output folder." 24 | exit 1 25 | fi 26 | 27 | INPUT_FOLDER=$1 28 | OUTPUT_FOLDER=$2 29 | 30 | # Ensure output folder exists 31 | mkdir -p "$OUTPUT_FOLDER" 32 | 33 | # Loop from 0 to NUM_DEVICES and run the marker command in parallel 34 | for (( i=0; i<$NUM_DEVICES; i++ )); do 35 | DEVICE_NUM=$i 36 | export DEVICE_NUM 37 | export NUM_DEVICES 38 | export NUM_WORKERS 39 | echo "Running marker on GPU $DEVICE_NUM" 40 | cmd="CUDA_VISIBLE_DEVICES=$DEVICE_NUM marker $INPUT_FOLDER --output_dir $OUTPUT_FOLDER --num_chunks $NUM_DEVICES --chunk_idx $DEVICE_NUM --workers $NUM_WORKERS" 41 | eval $cmd & 42 | 43 | sleep 5 44 | done 45 | 46 | # Wait for all background processes to finish 47 | wait ``` -------------------------------------------------------------------------------- /marker/schema/blocks/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from __future__ import annotations 2 | 3 | from marker.schema.blocks.base import Block, BlockId, BlockOutput 4 | from marker.schema.blocks.caption import Caption 5 | from marker.schema.blocks.code import Code 6 | from marker.schema.blocks.figure import Figure 7 | from marker.schema.blocks.footnote import Footnote 8 | from marker.schema.blocks.form import Form 9 | from marker.schema.blocks.equation import Equation 10 | from marker.schema.blocks.handwriting import Handwriting 11 | from marker.schema.blocks.inlinemath import InlineMath 12 | from marker.schema.blocks.listitem import ListItem 13 | from marker.schema.blocks.pagefooter import PageFooter 14 | from marker.schema.blocks.pageheader import PageHeader 15 | from marker.schema.blocks.picture import Picture 16 | from marker.schema.blocks.sectionheader import SectionHeader 17 | from marker.schema.blocks.table import Table 18 | from marker.schema.blocks.text import Text 19 | from marker.schema.blocks.toc import TableOfContents 20 | from 
class TableCell(Block):
    """A cell in a table, rendered as a ``<td>``/``<th>`` element."""

    block_type: BlockTypes = BlockTypes.TableCell
    rowspan: int
    colspan: int
    row_id: int
    col_id: int
    is_header: bool
    text_lines: List[str] | None = None
    block_description: str = "A cell in a table."

    @property
    def text(self) -> str:
        # Tolerate unset text_lines instead of raising TypeError on join(None).
        return "\n".join(self.text_lines or [])

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        """Render the cell tag, including row/col spans and optional block id."""
        add_cell_id = block_config and block_config.get("add_block_ids", False)

        tag_cls = "th" if self.is_header else "td"
        tag = f"<{tag_cls}"
        if self.rowspan > 1:
            tag += f" rowspan={self.rowspan}"
        if self.colspan > 1:
            tag += f" colspan={self.colspan}"
        if add_cell_id:
            tag += f' data-block-id="{self.id}"'
        # Bug fix: render without mutating self.text_lines (previously a None
        # value was overwritten with [] as a side effect of rendering).
        text = "<br>".join(self.text_lines or [])
        return f"{tag}>{text}</{tag_cls}>"
def verify_table_scores(file_path):
    """Check that the average marker table score in *file_path* is >= 0.7.

    Args:
        file_path: Path to a JSON file whose "marker" key holds a list of
            result dicts, each with a numeric "marker_score".

    Raises:
        ValueError: If the average score falls below 0.7.
    """
    with open(file_path, 'r') as file:
        data = json.load(file)

    results = data["marker"]
    # Bug fix: average over the number of marker results, not len(data),
    # which counted the top-level JSON keys instead.
    avg = sum(r["marker_score"] for r in results) / len(results)
    if avg < 0.7:
        raise ValueError("Average score is below the required threshold of 0.7")
container info if relevant): 41 | 42 | ## ✅ Expected Behavior 43 | 44 | What did you expect Marker to do? 45 | 46 | ## 📟 Command or Code Used 47 | 48 | Paste the **exact bash command** or **Python code** you used to run Marker: 49 | 50 | <details> 51 | <summary>Click to expand</summary> 52 | 53 | ```bash 54 | # or Python code block 55 | your_command_here --with-flags 56 | ``` 57 | 58 | </details> 59 | 60 | ## 📎 Additional Context 61 | 62 | Any other context that might help us debug this (e.g., CLI options, working directory, runtime settings). 63 | ``` -------------------------------------------------------------------------------- /marker/models.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | 3 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = ( 4 | "1" # Transformers uses .isin for an op, which is not supported on MPS 5 | ) 6 | 7 | from surya.foundation import FoundationPredictor 8 | from surya.detection import DetectionPredictor 9 | from surya.layout import LayoutPredictor 10 | from surya.ocr_error import OCRErrorPredictor 11 | from surya.recognition import RecognitionPredictor 12 | from surya.table_rec import TableRecPredictor 13 | from surya.settings import settings as surya_settings 14 | 15 | 16 | def create_model_dict( 17 | device=None, dtype=None, attention_implementation: str | None = None 18 | ) -> dict: 19 | return { 20 | "layout_model": LayoutPredictor(FoundationPredictor(checkpoint=surya_settings.LAYOUT_MODEL_CHECKPOINT, attention_implementation=attention_implementation, device=device, dtype=dtype)), 21 | "recognition_model": RecognitionPredictor(FoundationPredictor(checkpoint=surya_settings.RECOGNITION_MODEL_CHECKPOINT, attention_implementation=attention_implementation, device=device, dtype=dtype)), 22 | "table_rec_model": TableRecPredictor(device=device, dtype=dtype), 23 | "detection_model": DetectionPredictor(device=device, dtype=dtype), 24 | "ocr_error_model": 
class BaseExtractor:
    """
    An extractor that uses a provided service to extract structured data from documents.
    """

    max_concurrency: Annotated[
        int,
        "The maximum number of concurrent requests to make to the Gemini model.",
    ] = 3
    disable_tqdm: Annotated[
        bool,
        "Whether to disable the tqdm progress bar.",
    ] = False

    def __init__(self, llm_service: BaseService, config=None):
        # Apply config overrides onto instance attributes
        # (see marker.util.assign_config for the exact semantics).
        assign_config(self, config)
        self.llm_service = llm_service

    def extract_image(
        self,
        document: Document,
        page: PageGroup,
        remove_blocks: Sequence[BlockTypes] | None = None,
        highres: bool = False,  # Default False to save tokens
    ) -> Image.Image:
        """Render *page* to a PIL image, optionally excluding some block types."""
        return page.get_image(
            document,
            highres=highres,
            remove_blocks=remove_blocks,
        )

    def __call__(self, document: Document, *args, **kwargs):
        # Subclasses implement the actual extraction here.
        raise NotImplementedError
@click.command(help="Uploads files to an S3 bucket")
@click.argument("filepath", type=str)
@click.argument("s3_path", type=str)
@click.option("--bucket_name", type=str, default="datalab")
@click.option("--access_key_id", type=str, default="<access_key_id>")
@click.option("--access_key_secret", type=str, default="<access_key_secret>")
def main(filepath: str, s3_path: str, bucket_name: str, access_key_id: str, access_key_secret: str):
    """Upload a single file to an S3-compatible (Cloudflare R2) bucket,
    under ``{s3_path}/{filename}``.
    """
    filepath = Path(filepath)
    # Upload the files to S3
    s3_client = boto3.client(
        's3',
        endpoint_url=S3_API_URL,
        aws_access_key_id=access_key_id,
        aws_secret_access_key=access_key_secret,
        region_name="enam"
    )

    s3_key = f"{s3_path}/{filepath.name}"

    try:
        s3_client.upload_file(
            str(filepath),
            bucket_name,
            s3_key
        )
    except Exception as e:
        print(f"Error uploading {filepath}: {str(e)}")
    else:
        # Bug fix: only report success when the upload actually succeeded
        # (previously this printed unconditionally, even after an error).
        print(f"Uploaded files to {s3_path}")

if __name__ == "__main__":
    main()
os.getenv("VERTEX_PROJECT_ID"), 23 | }) 24 | 25 | block_converter = PdfConverter( 26 | artifact_dict=self.model_dict, 27 | config=parser.generate_config_dict(), 28 | llm_service=parser.get_llm_service() 29 | ) 30 | 31 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: 32 | f.write(pdf_bytes) 33 | start = time.time() 34 | rendered = block_converter(f.name) 35 | total = time.time() - start 36 | 37 | return { 38 | "markdown": rendered.markdown, 39 | "time": total 40 | } 41 | 42 | ``` -------------------------------------------------------------------------------- /marker/utils/image.py: -------------------------------------------------------------------------------- ```python 1 | from PIL import Image 2 | import numpy as np 3 | import cv2 4 | from typing import List, Optional 5 | 6 | def is_blank_image(image: Image.Image, polygon: Optional[List[List[int]]] = None) -> bool: 7 | image = np.asarray(image) 8 | if ( 9 | image is None 10 | or image.size == 0 11 | or image.shape[0] == 0 12 | or image.shape[1] == 0 13 | ): 14 | # Handle empty image case 15 | return True 16 | 17 | if polygon is not None: 18 | rounded_polys = [[int(corner[0]), int(corner[1])] for corner in polygon] 19 | if rounded_polys[0] == rounded_polys[1] and rounded_polys[2] == rounded_polys[3]: 20 | return True 21 | 22 | gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) 23 | gray = cv2.GaussianBlur(gray, (7, 7), 0) 24 | 25 | # Adaptive threshold (inverse for text as white) 26 | binarized = cv2.adaptiveThreshold( 27 | gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15 28 | ) 29 | 30 | num_labels, labels, stats, _ = cv2.connectedComponentsWithStats( 31 | binarized, connectivity=8 32 | ) 33 | cleaned = np.zeros_like(binarized) 34 | for i in range(1, num_labels): # skip background 35 | cleaned[labels == i] = 255 36 | 37 | kernel = np.ones((1, 5), np.uint8) 38 | dilated = cv2.dilate(cleaned, kernel, iterations=3) 39 | b = dilated / 255 40 | return bool(b.sum() == 0) ``` 
-------------------------------------------------------------------------------- /marker/schema/blocks/text.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Text(Block): 6 | block_type: BlockTypes = BlockTypes.Text 7 | has_continuation: bool = False 8 | blockquote: bool = False 9 | blockquote_level: int = 0 10 | html: str | None = None 11 | block_description: str = "A paragraph or line of text." 12 | 13 | def assemble_html( 14 | self, document, child_blocks, parent_structure, block_config=None 15 | ): 16 | if self.ignore_for_output: 17 | return "" 18 | 19 | # This happens when we used an llm processor 20 | if self.html: 21 | return super().handle_html_output( 22 | document, child_blocks, parent_structure, block_config 23 | ) 24 | 25 | template = super().assemble_html( 26 | document, child_blocks, parent_structure, block_config 27 | ) 28 | template = template.replace("\n", " ") 29 | 30 | el_attr = f" block-type='{self.block_type}'" 31 | if self.has_continuation: 32 | el_attr += " class='has-continuation'" 33 | 34 | if self.blockquote: 35 | blockquote_prefix = "<blockquote>" * self.blockquote_level 36 | blockquote_suffix = "</blockquote>" * self.blockquote_level 37 | return f"{blockquote_prefix}<p{el_attr}>{template}</p>{blockquote_suffix}" 38 | else: 39 | return f"<p{el_attr}>{template}</p>" 40 | ``` -------------------------------------------------------------------------------- /.github/workflows/cla.yml: -------------------------------------------------------------------------------- ```yaml 1 | name: "Marker CLA Assistant" 2 | on: 3 | issue_comment: 4 | types: [created] 5 | pull_request_target: 6 | types: [opened,closed,synchronize] 7 | 8 | # explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings 9 | permissions: 10 | actions: write 11 | contents: 
write 12 | pull-requests: write 13 | statuses: write 14 | 15 | jobs: 16 | CLAAssistant: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: "Marker CLA Assistant" 20 | if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target' 21 | uses: contributor-assistant/[email protected] 22 | env: 23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 24 | # the below token should have repo scope and must be manually added by you in the repository's secret 25 | # This token is required only if you have configured to store the signatures in a remote repository/organization 26 | PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 27 | with: 28 | path-to-signatures: 'signatures/version1/cla.json' 29 | path-to-document: 'https://github.com/VikParuchuri/marker/blob/master/CLA.md' 30 | # branch should not be protected 31 | branch: 'master' 32 | allowlist: VikParuchuri,Sandy ``` -------------------------------------------------------------------------------- /tests/builders/test_overriding.py: -------------------------------------------------------------------------------- ```python 1 | import multiprocessing as mp 2 | 3 | import pytest 4 | 5 | from marker.providers.pdf import PdfProvider 6 | from marker.schema import BlockTypes 7 | from marker.schema.blocks import SectionHeader 8 | from marker.schema.document import Document 9 | from marker.schema.registry import register_block_class 10 | from marker.schema.text import Line 11 | from tests.utils import setup_pdf_provider 12 | 13 | 14 | class NewSectionHeader(SectionHeader): 15 | pass 16 | 17 | 18 | class NewLine(Line): 19 | pass 20 | 21 | 22 | @pytest.mark.config({ 23 | "page_range": [0], 24 | "override_map": {BlockTypes.SectionHeader: NewSectionHeader} 25 | }) 26 | def test_overriding(pdf_document: Document): 27 | assert pdf_document.pages[0]\ 28 | .get_block(pdf_document.pages[0].structure[0]).__class__ == 
NewSectionHeader 29 | 30 | 31 | def get_lines(pdf: str, config=None): 32 | for block_type, block_cls in config["override_map"].items(): 33 | register_block_class(block_type, block_cls) 34 | 35 | provider: PdfProvider = setup_pdf_provider(pdf, config) 36 | return provider.get_page_lines(0) 37 | 38 | 39 | def test_overriding_mp(): 40 | config = { 41 | "page_range": [0], 42 | "override_map": {BlockTypes.Line: NewLine} 43 | } 44 | 45 | pdf_list = ["adversarial.pdf", "adversarial_rot.pdf"] 46 | 47 | with mp.Pool(processes=2) as pool: 48 | results = pool.starmap(get_lines, [(pdf, config) for pdf in pdf_list]) 49 | assert all([r[0].line.__class__ == NewLine for r in results]) 50 | ``` -------------------------------------------------------------------------------- /.github/workflows/scripts.yml: -------------------------------------------------------------------------------- ```yaml 1 | name: Test CLI scripts 2 | 3 | on: [push] 4 | 5 | jobs: 6 | tests: 7 | runs-on: t4_gpu 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: Set up Python 3.11 11 | uses: actions/setup-python@v4 12 | with: 13 | python-version: 3.11 14 | - name: Install python dependencies 15 | run: | 16 | pip install poetry 17 | poetry install --extras "full" 18 | - name: Download benchmark data 19 | run: | 20 | wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi" 21 | unzip -o benchmark_data.zip 22 | - name: Test single script 23 | run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 24 | - name: Test convert script 25 | run: poetry run marker benchmark_data/pdfs --max_files 1 --page_range 0 26 | - name: Text convert script multiple workers 27 | run: poetry run marker benchmark_data/pdfs --max_files 2 --page_range 0-5 28 | - name: Test llm option 29 | run: | 30 | poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 --use_llm > output.txt || echo "Command failed but continuing" 31 | if ! 
grep -q "UserWarning" output.txt; then 32 | echo "Success: No UserWarning found" 33 | exit 0 34 | else 35 | echo "Error: UserWarning found in output" 36 | exit 1 37 | fi ``` -------------------------------------------------------------------------------- /marker/processors/footnote.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | 3 | from marker.processors import BaseProcessor 4 | from marker.schema import BlockTypes 5 | from marker.schema.document import Document 6 | from marker.schema.groups import PageGroup 7 | 8 | 9 | class FootnoteProcessor(BaseProcessor): 10 | """ 11 | A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks. 12 | """ 13 | block_types = (BlockTypes.Footnote,) 14 | 15 | def __call__(self, document: Document): 16 | for page in document.pages: 17 | self.push_footnotes_to_bottom(page, document) 18 | self.assign_superscripts(page, document) 19 | 20 | def push_footnotes_to_bottom(self, page: PageGroup, document: Document): 21 | footnote_blocks = page.contained_blocks(document, self.block_types) 22 | 23 | # Push footnotes to the bottom 24 | for block in footnote_blocks: 25 | # Check if it is top-level 26 | if block.id in page.structure: 27 | # Move to bottom if it is 28 | page.structure.remove(block.id) 29 | page.add_structure(block) 30 | 31 | def assign_superscripts(self, page: PageGroup, document: Document): 32 | footnote_blocks = page.contained_blocks(document, self.block_types) 33 | 34 | for block in footnote_blocks: 35 | for span in block.contained_blocks(document, (BlockTypes.Span,)): 36 | if re.match(r"^[0-9\W]+", span.text): 37 | span.has_superscript = True 38 | break 39 | ``` -------------------------------------------------------------------------------- /marker/scripts/convert_single.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | 3 | os.environ["GRPC_VERBOSITY"] = "ERROR" 4 | 
os.environ["GLOG_minloglevel"] = "2" 5 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = ( 6 | "1" # Transformers uses .isin for a simple op, which is not supported on MPS 7 | ) 8 | 9 | import time 10 | import click 11 | 12 | from marker.config.parser import ConfigParser 13 | from marker.config.printer import CustomClickPrinter 14 | from marker.logger import configure_logging, get_logger 15 | from marker.models import create_model_dict 16 | from marker.output import save_output 17 | 18 | configure_logging() 19 | logger = get_logger() 20 | 21 | 22 | @click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.") 23 | @click.argument("fpath", type=str) 24 | @ConfigParser.common_options 25 | def convert_single_cli(fpath: str, **kwargs): 26 | models = create_model_dict() 27 | start = time.time() 28 | config_parser = ConfigParser(kwargs) 29 | 30 | converter_cls = config_parser.get_converter_cls() 31 | converter = converter_cls( 32 | config=config_parser.generate_config_dict(), 33 | artifact_dict=models, 34 | processor_list=config_parser.get_processors(), 35 | renderer=config_parser.get_renderer(), 36 | llm_service=config_parser.get_llm_service(), 37 | ) 38 | rendered = converter(fpath) 39 | out_folder = config_parser.get_output_folder(fpath) 40 | save_output(rendered, out_folder, config_parser.get_base_filename(fpath)) 41 | 42 | logger.info(f"Saved markdown to {out_folder}") 43 | logger.info(f"Total time: {time.time() - start}") 44 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/inlinemath.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class InlineMath(Block): 6 | block_type: BlockTypes = BlockTypes.TextInlineMath 7 | has_continuation: bool = False 8 | blockquote: bool = False 9 | blockquote_level: int = 0 10 | block_description: str = "A text block that 
contains inline math. This is not used for italic text or references - only for text that contains math." 11 | html: str | None = None 12 | 13 | def assemble_html( 14 | self, document, child_blocks, parent_structure, block_config=None 15 | ): 16 | if self.ignore_for_output: 17 | return "" 18 | 19 | if self.html: 20 | return super().handle_html_output( 21 | document, child_blocks, parent_structure, block_config 22 | ) 23 | 24 | template = super().assemble_html( 25 | document, child_blocks, parent_structure, block_config 26 | ) 27 | template = template.replace("\n", " ") 28 | 29 | el_attr = f" block-type='{self.block_type}'" 30 | if self.has_continuation: 31 | el_attr += " class='has-continuation'" 32 | 33 | if self.blockquote: 34 | # Add indentation for blockquote levels 35 | blockquote_prefix = "<blockquote>" * self.blockquote_level 36 | blockquote_suffix = "</blockquote>" * self.blockquote_level 37 | return f"{blockquote_prefix}<p{el_attr}>{template}</p>{blockquote_suffix}" 38 | else: 39 | return f"<p{el_attr}>{template}</p>" 40 | ``` -------------------------------------------------------------------------------- /tests/builders/test_layout_replace.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.builders.document import DocumentBuilder 4 | from marker.builders.layout import LayoutBuilder 5 | from marker.builders.line import LineBuilder 6 | from marker.renderers.markdown import MarkdownRenderer 7 | from marker.schema import BlockTypes 8 | from marker.schema.registry import get_block_class 9 | 10 | 11 | @pytest.mark.filename("thinkpython.pdf") 12 | @pytest.mark.config({"page_range": [0]}) 13 | def test_layout_replace( 14 | request, config, doc_provider, layout_model, ocr_error_model, detection_model 15 | ): 16 | # The llm layout builder replaces blocks - this makes sure text is still merged properly 17 | layout_builder = LayoutBuilder(layout_model, config) 18 | line_builder = 
LineBuilder(detection_model, ocr_error_model, config) 19 | builder = DocumentBuilder(config) 20 | document = builder.build_document(doc_provider) 21 | layout_builder(document, doc_provider) 22 | page = document.pages[0] 23 | new_blocks = [] 24 | for block in page.contained_blocks(document, (BlockTypes.Text,)): 25 | generated_block_class = get_block_class(BlockTypes.TextInlineMath) 26 | generated_block = generated_block_class( 27 | polygon=block.polygon, 28 | page_id=block.page_id, 29 | structure=block.structure, 30 | ) 31 | page.replace_block(block, generated_block) 32 | new_blocks.append(generated_block) 33 | line_builder(document, doc_provider) 34 | 35 | for block in new_blocks: 36 | assert block.raw_text(document).strip() 37 | 38 | renderer = MarkdownRenderer(config) 39 | rendered = renderer(document) 40 | 41 | assert "Think Python" in rendered.markdown 42 | ``` -------------------------------------------------------------------------------- /marker/processors/code.py: -------------------------------------------------------------------------------- ```python 1 | from marker.processors import BaseProcessor 2 | from marker.schema import BlockTypes 3 | from marker.schema.blocks import Code 4 | from marker.schema.document import Document 5 | 6 | 7 | class CodeProcessor(BaseProcessor): 8 | """ 9 | A processor for formatting code blocks. 
10 | """ 11 | block_types = (BlockTypes.Code, ) 12 | 13 | def __call__(self, document: Document): 14 | for page in document.pages: 15 | for block in page.contained_blocks(document, self.block_types): 16 | self.format_block(document, block) 17 | 18 | 19 | def format_block(self, document: Document, block: Code): 20 | min_left = 9999 # will contain x- coord of column 0 21 | total_width = 0 22 | total_chars = 0 23 | 24 | contained_lines = block.contained_blocks(document, (BlockTypes.Line,)) 25 | for line in contained_lines: 26 | min_left = min(line.polygon.bbox[0], min_left) 27 | total_width += line.polygon.width 28 | total_chars += len(line.raw_text(document)) 29 | 30 | avg_char_width = total_width / max(total_chars, 1) 31 | code_text = "" 32 | is_new_line = False 33 | for line in contained_lines: 34 | text = line.raw_text(document) 35 | if avg_char_width == 0: 36 | prefix = "" 37 | else: 38 | total_spaces = int((line.polygon.bbox[0] - min_left) / avg_char_width) 39 | prefix = " " * max(0, total_spaces) 40 | 41 | if is_new_line: 42 | text = prefix + text 43 | 44 | code_text += text 45 | is_new_line = text.endswith("\n") 46 | 47 | block.code = code_text.rstrip() 48 | ``` -------------------------------------------------------------------------------- /marker/settings.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Optional 2 | 3 | from dotenv import find_dotenv 4 | from pydantic import computed_field 5 | from pydantic_settings import BaseSettings 6 | import torch 7 | import os 8 | 9 | 10 | class Settings(BaseSettings): 11 | # Paths 12 | BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 13 | OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results") 14 | FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts") 15 | DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data") 16 | ARTIFACT_URL: str = "https://models.datalab.to/artifacts" 17 | FONT_NAME: str = 
"GoNotoCurrent-Regular.ttf" 18 | FONT_PATH: str = os.path.join(FONT_DIR, FONT_NAME) 19 | LOGLEVEL: str = "INFO" 20 | 21 | # General 22 | OUTPUT_ENCODING: str = "utf-8" 23 | OUTPUT_IMAGE_FORMAT: str = "JPEG" 24 | 25 | # LLM 26 | GOOGLE_API_KEY: Optional[str] = "" 27 | 28 | # General models 29 | TORCH_DEVICE: Optional[str] = ( 30 | None # Note: MPS device does not work for text detection, and will default to CPU 31 | ) 32 | 33 | @computed_field 34 | @property 35 | def TORCH_DEVICE_MODEL(self) -> str: 36 | if self.TORCH_DEVICE is not None: 37 | return self.TORCH_DEVICE 38 | 39 | if torch.cuda.is_available(): 40 | return "cuda" 41 | 42 | if torch.backends.mps.is_available(): 43 | return "mps" 44 | 45 | return "cpu" 46 | 47 | @computed_field 48 | @property 49 | def MODEL_DTYPE(self) -> torch.dtype: 50 | if self.TORCH_DEVICE_MODEL == "cuda": 51 | return torch.bfloat16 52 | else: 53 | return torch.float32 54 | 55 | class Config: 56 | env_file = find_dotenv("local.env") 57 | extra = "ignore" 58 | 59 | 60 | settings = Settings() 61 | ``` -------------------------------------------------------------------------------- /marker/converters/ocr.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Tuple 2 | 3 | from marker.builders.document import DocumentBuilder 4 | from marker.builders.line import LineBuilder 5 | from marker.builders.ocr import OcrBuilder 6 | from marker.converters.pdf import PdfConverter 7 | from marker.processors import BaseProcessor 8 | from marker.processors.equation import EquationProcessor 9 | from marker.providers.registry import provider_from_filepath 10 | from marker.renderers.ocr_json import OCRJSONRenderer 11 | 12 | 13 | class OCRConverter(PdfConverter): 14 | default_processors: Tuple[BaseProcessor, ...] 
= (EquationProcessor,) 15 | 16 | def __init__(self, *args, **kwargs): 17 | super().__init__(*args, **kwargs) 18 | 19 | if not self.config: 20 | self.config = {} 21 | 22 | self.config["force_ocr"] = True 23 | self.renderer = OCRJSONRenderer 24 | 25 | def build_document(self, filepath: str): 26 | provider_cls = provider_from_filepath(filepath) 27 | layout_builder = self.resolve_dependencies(self.layout_builder_class) 28 | line_builder = self.resolve_dependencies(LineBuilder) 29 | ocr_builder = self.resolve_dependencies(OcrBuilder) 30 | document_builder = DocumentBuilder(self.config) 31 | 32 | provider = provider_cls(filepath, self.config) 33 | document = document_builder(provider, layout_builder, line_builder, ocr_builder) 34 | 35 | for processor in self.processor_list: 36 | processor(document) 37 | 38 | return document 39 | 40 | def __call__(self, filepath: str): 41 | document = self.build_document(filepath) 42 | self.page_count = len(document.pages) 43 | renderer = self.resolve_dependencies(self.renderer) 44 | return renderer(document) 45 | ``` -------------------------------------------------------------------------------- /tests/builders/test_document_builder.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.text.line import Line 5 | 6 | 7 | @pytest.mark.filename("thinkpython.pdf") 8 | @pytest.mark.config({"page_range": [0]}) 9 | def test_document_builder(pdf_document): 10 | first_page = pdf_document.pages[0] 11 | assert first_page.structure[0] == "/page/0/SectionHeader/0" 12 | 13 | first_block = first_page.get_block(first_page.structure[0]) 14 | assert first_block.block_type == BlockTypes.SectionHeader 15 | assert first_block.text_extraction_method == "pdftext" 16 | 17 | first_text_block: Line = first_page.get_block(first_block.structure[0]) 18 | assert first_text_block.block_type == BlockTypes.Line 19 | 20 | first_span = 
first_page.get_block(first_text_block.structure[0]) 21 | assert first_span.block_type == BlockTypes.Span 22 | assert first_span.text == "Think Python" 23 | assert first_span.font == "URWPalladioL-Roma" 24 | assert first_span.formats == ["plain"] 25 | 26 | 27 | @pytest.mark.config({"page_range": [0]}) 28 | def test_document_builder_inline_eq(pdf_document): 29 | first_page = pdf_document.pages[0] 30 | assert first_page.structure[0] == "/page/0/SectionHeader/0" 31 | 32 | first_block = first_page.get_block(first_page.structure[0]) 33 | assert first_block.block_type == BlockTypes.SectionHeader 34 | assert first_block.text_extraction_method == "surya" 35 | 36 | first_text_block: Line = first_page.get_block(first_block.structure[0]) 37 | assert first_text_block.block_type == BlockTypes.Line 38 | 39 | first_span = first_page.get_block(first_text_block.structure[0]) 40 | assert first_span.block_type == BlockTypes.Span 41 | assert first_span.text.strip() == "Subspace Adversarial Training" 42 | assert "bold" in first_span.formats 43 | ``` -------------------------------------------------------------------------------- /marker/services/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Optional, List, Annotated 2 | from io import BytesIO 3 | 4 | import PIL 5 | from pydantic import BaseModel 6 | 7 | from marker.schema.blocks import Block 8 | from marker.util import assign_config, verify_config_keys 9 | import base64 10 | 11 | 12 | class BaseService: 13 | timeout: Annotated[int, "The timeout to use for the service."] = 30 14 | max_retries: Annotated[ 15 | int, "The maximum number of retries to use for the service." 16 | ] = 2 17 | retry_wait_time: Annotated[int, "The wait time between retries."] = 3 18 | max_output_tokens: Annotated[ 19 | int, "The maximum number of output tokens to generate." 
20 | ] = None 21 | 22 | def img_to_base64(self, img: PIL.Image.Image, format: str = "WEBP"): 23 | image_bytes = BytesIO() 24 | img.save(image_bytes, format=format) 25 | return base64.b64encode(image_bytes.getvalue()).decode("utf-8") 26 | 27 | def process_images(self, images: List[PIL.Image.Image]) -> list: 28 | raise NotImplementedError 29 | 30 | def format_image_for_llm(self, image): 31 | if not image: 32 | return [] 33 | 34 | if not isinstance(image, list): 35 | image = [image] 36 | 37 | image_parts = self.process_images(image) 38 | return image_parts 39 | 40 | def __init__(self, config: Optional[BaseModel | dict] = None): 41 | assign_config(self, config) 42 | 43 | # Ensure we have all necessary fields filled out (API keys, etc.) 44 | verify_config_keys(self) 45 | 46 | def __call__( 47 | self, 48 | prompt: str, 49 | image: PIL.Image.Image | List[PIL.Image.Image] | None, 50 | block: Block | None, 51 | response_schema: type[BaseModel], 52 | max_retries: int | None = None, 53 | timeout: int | None = None, 54 | ): 55 | raise NotImplementedError 56 | ``` -------------------------------------------------------------------------------- /tests/providers/test_document_providers.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | 4 | @pytest.mark.config({"page_range": [0]}) 5 | @pytest.mark.filename("lambda.pptx") 6 | def test_pptx_provider(doc_provider): 7 | assert doc_provider.get_images([0], 72)[0].size == (842, 596) 8 | 9 | page_lines = doc_provider.get_page_lines(0) 10 | 11 | spans = page_lines[0].spans 12 | assert spans[0].text == "Lambda Calculus" 13 | 14 | spans = page_lines[1].spans 15 | assert spans[0].text == "CSE 340 – Principles of Programming Languages" 16 | 17 | 18 | @pytest.mark.config({"page_range": [0]}) 19 | @pytest.mark.filename("manual.epub") 20 | def test_epub_provider(doc_provider): 21 | assert doc_provider.get_images([0], 72)[0].size == (596, 842) 22 | 23 | page_lines = 
doc_provider.get_page_lines(0) 24 | 25 | spans = page_lines[0].spans 26 | assert spans[0].text == "The Project Gutenberg eBook of Simple" 27 | 28 | 29 | @pytest.mark.config({"page_range": [0]}) 30 | @pytest.mark.filename("china.html") 31 | def test_html_provider(doc_provider): 32 | assert doc_provider.get_images([0], 72)[0].size == (596, 842) 33 | 34 | page_lines = doc_provider.get_page_lines(0) 35 | 36 | spans = page_lines[0].spans 37 | assert spans[0].text == "Jump to content" 38 | 39 | @pytest.mark.config({"page_range": [0]}) 40 | @pytest.mark.filename("gatsby.docx") 41 | def test_docx_provider(doc_provider): 42 | assert doc_provider.get_images([0], 72)[0].size == (596, 842) 43 | 44 | page_lines = doc_provider.get_page_lines(0) 45 | 46 | spans = page_lines[0].spans 47 | assert spans[0].text == "Themes" 48 | 49 | 50 | @pytest.mark.config({"page_range": [0]}) 51 | @pytest.mark.filename("single_sheet.xlsx") 52 | def test_xlsx_provider(doc_provider): 53 | assert doc_provider.get_images([0], 72)[0].size == (842, 596) 54 | 55 | page_lines = doc_provider.get_page_lines(0) 56 | 57 | spans = page_lines[0].spans 58 | assert spans[0].text == "Sheet1" ``` -------------------------------------------------------------------------------- /marker/schema/blocks/listitem.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.blocks import Block 5 | 6 | 7 | def replace_bullets(child_blocks): 8 | # Replace bullet characters with a - 9 | first_block = None 10 | while len(child_blocks) > 0: 11 | first_block = child_blocks[0] 12 | child_blocks = first_block.children 13 | 14 | if first_block is not None and first_block.id.block_type == BlockTypes.Line: 15 | bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○ഠ ം◦■▪▫–—-]( )" 16 | first_block.html = re.sub(bullet_pattern, r"\1\2", first_block.html) 17 | 18 | 19 | class ListItem(Block): 20 | block_type: BlockTypes = 
BlockTypes.ListItem 21 | list_indent_level: int = 0 22 | block_description: str = "A list item that is part of a list. This block is used to represent a single item in a list." 23 | html: str | None = None 24 | 25 | def assemble_html( 26 | self, document, child_blocks, parent_structure, block_config=None 27 | ): 28 | template = super().assemble_html( 29 | document, child_blocks, parent_structure, block_config 30 | ) 31 | template = template.replace("\n", " ") 32 | # Remove the first bullet character 33 | replace_bullets(child_blocks) 34 | 35 | if self.html: 36 | template = ( 37 | super() 38 | .handle_html_output( 39 | document, child_blocks, parent_structure, block_config 40 | ) 41 | .strip() 42 | ) 43 | template = template.replace("<li>", "").replace("</li>", "") 44 | 45 | el_attr = f" block-type='{self.block_type}'" 46 | if self.list_indent_level: 47 | return f"<ul><li{el_attr} class='list-indent-{self.list_indent_level}'>{template}</li></ul>" 48 | return f"<li{el_attr}>{template}</li>" 49 | ``` -------------------------------------------------------------------------------- /marker/providers/image.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List, Annotated 2 | from PIL import Image 3 | 4 | from marker.providers import ProviderPageLines, BaseProvider 5 | from marker.schema.polygon import PolygonBox 6 | from marker.schema.text import Line 7 | from pdftext.schema import Reference 8 | 9 | 10 | class ImageProvider(BaseProvider): 11 | page_range: Annotated[ 12 | List[int], 13 | "The range of pages to process.", 14 | "Default is None, which will process all pages.", 15 | ] = None 16 | 17 | image_count: int = 1 18 | 19 | def __init__(self, filepath: str, config=None): 20 | super().__init__(filepath, config) 21 | 22 | self.images = [Image.open(filepath)] 23 | self.page_lines: ProviderPageLines = {i: [] for i in range(self.image_count)} 24 | 25 | if self.page_range is None: 26 | self.page_range = 
range(self.image_count) 27 | 28 | assert max(self.page_range) < self.image_count and min(self.page_range) >= 0, ( 29 | f"Invalid page range, values must be between 0 and {len(self.doc) - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}." 30 | ) 31 | 32 | self.page_bboxes = { 33 | i: [0, 0, self.images[i].size[0], self.images[i].size[1]] 34 | for i in self.page_range 35 | } 36 | 37 | def __len__(self): 38 | return self.image_count 39 | 40 | def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]: 41 | return [self.images[i] for i in idxs] 42 | 43 | def get_page_bbox(self, idx: int) -> PolygonBox | None: 44 | bbox = self.page_bboxes[idx] 45 | if bbox: 46 | return PolygonBox.from_bbox(bbox) 47 | 48 | def get_page_lines(self, idx: int) -> List[Line]: 49 | return self.page_lines[idx] 50 | 51 | def get_page_refs(self, idx: int) -> List[Reference]: 52 | return [] 53 | ``` -------------------------------------------------------------------------------- /tests/builders/test_ocr_pipeline.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.text.line import Line 5 | 6 | 7 | def _ocr_pipeline_test(pdf_document): 8 | first_page = pdf_document.pages[0] 9 | assert first_page.structure[0] == "/page/0/SectionHeader/0" 10 | 11 | first_block = first_page.get_block(first_page.structure[0]) 12 | assert first_block.text_extraction_method == "surya" 13 | assert first_block.block_type == BlockTypes.SectionHeader 14 | 15 | first_text_block: Line = first_page.get_block(first_block.structure[0]) 16 | assert first_text_block.block_type == BlockTypes.Line 17 | 18 | first_span = first_page.get_block(first_text_block.structure[0]) 19 | assert first_span.block_type == BlockTypes.Span 20 | assert first_span.text.strip() == "Subspace Adversarial Training" 21 | 22 | # Ensure we match all text lines up properly 23 | # 
Makes sure the OCR bbox is being scaled to the same scale as the layout boxes 24 | text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,)) 25 | text_blocks = first_page.contained_blocks( 26 | pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath) 27 | ) 28 | # assert len(text_lines) == 83 29 | 30 | # Ensure the bbox sizes match up 31 | max_line_position = max([line.polygon.y_end for line in text_lines]) 32 | max_block_position = max( 33 | [block.polygon.y_end for block in text_blocks if block.source == "layout"] 34 | ) 35 | assert max_line_position <= (max_block_position * 1.02) 36 | 37 | 38 | @pytest.mark.config({"force_ocr": True, "page_range": [0]}) 39 | def test_ocr_pipeline(pdf_document): 40 | _ocr_pipeline_test(pdf_document) 41 | 42 | 43 | @pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True}) 44 | def test_ocr_with_inline_pipeline(pdf_document): 45 | _ocr_pipeline_test(pdf_document) 46 | ``` -------------------------------------------------------------------------------- /benchmarks/table/gemini.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | from PIL import Image 3 | from google import genai 4 | from google.genai import types 5 | from io import BytesIO 6 | from pydantic import BaseModel 7 | 8 | from marker.settings import settings 9 | 10 | prompt = """ 11 | You're an expert document analyst who is good at turning tables in documents into HTML. Analyze the provided image, and convert it to a faithful HTML representation. 12 | 13 | Guidelines: 14 | - Keep the HTML simple and concise. 15 | - Only include the <table> tag and contents. 16 | - Only use <table>, <tr>, and <td> tags. Only use the colspan and rowspan attributes if necessary. Do not use <tbody>, <thead>, or <th> tags. 17 | - Make sure the table is as faithful to the image as possible with the given tags. 18 | 19 | **Instructions** 20 | 1. 
Analyze the image, and determine the table structure. 21 | 2. Convert the table image to HTML, following the guidelines above. 22 | 3. Output only the HTML for the table, starting with the <table> tag and ending with the </table> tag. 23 | """.strip() 24 | 25 | class TableSchema(BaseModel): 26 | table_html: str 27 | 28 | def gemini_table_rec(image: Image.Image): 29 | client = genai.Client( 30 | api_key=settings.GOOGLE_API_KEY, 31 | http_options={"timeout": 60000} 32 | ) 33 | 34 | image_bytes = BytesIO() 35 | image.save(image_bytes, format="PNG") 36 | 37 | responses = client.models.generate_content( 38 | model="gemini-2.0-flash", 39 | contents=[types.Part.from_bytes(data=image_bytes.getvalue(), mime_type="image/png"), prompt], # According to gemini docs, it performs better if the image is the first element 40 | config={ 41 | "temperature": 0, 42 | "response_schema": TableSchema, 43 | "response_mime_type": "application/json", 44 | }, 45 | ) 46 | 47 | output = responses.candidates[0].content.parts[0].text 48 | return json.loads(output)["table_html"] ``` -------------------------------------------------------------------------------- /tests/builders/test_pdf_links.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | 3 | import pytest 4 | 5 | from marker.converters.pdf import PdfConverter 6 | from marker.renderers.markdown import MarkdownOutput 7 | from marker.schema import BlockTypes 8 | from marker.schema.document import Document 9 | from marker.util import classes_to_strings 10 | 11 | 12 | @pytest.mark.filename("arxiv_test.pdf") 13 | @pytest.mark.output_format("markdown") 14 | @pytest.mark.config({"disable_ocr": True}) 15 | def test_pdf_links(pdf_document: Document, config, renderer, model_dict, temp_doc): 16 | first_page = pdf_document.pages[1] 17 | 18 | processors = ["marker.processors.reference.ReferenceProcessor"] 19 | pdf_converter = PdfConverter( 20 | artifact_dict=model_dict, 21 | 
processor_list=processors, 22 | renderer=classes_to_strings([renderer])[0], 23 | config=config, 24 | ) 25 | 26 | for section_header_span in first_page.contained_blocks( 27 | pdf_document, (BlockTypes.Span,) 28 | ): 29 | if "II." in section_header_span.text: 30 | assert section_header_span.url == "#page-1-0" 31 | break 32 | else: 33 | raise ValueError("Could not find II. in the first page") 34 | 35 | section_header_block = first_page.contained_blocks( 36 | pdf_document, (BlockTypes.SectionHeader,) 37 | )[0] 38 | assert section_header_block.raw_text(pdf_document) == "II. THEORETICAL FRAMEWORK\n" 39 | 40 | assert first_page.refs[0].ref == "page-1-0" 41 | 42 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 43 | markdown = markdown_output.markdown 44 | 45 | assert "[II.](#page-1-0)" in markdown 46 | assert '<span id="page-1-0"></span>II. THEORETICAL FRAMEWORK' in markdown 47 | 48 | for ref in set( 49 | [ 50 | f'<span id="page-{m[0]}-{m[1]}">' 51 | for m in re.findall(r"\]\(#page-(\d+)-(\d+)\)", markdown) 52 | ] 53 | ): 54 | assert ref in markdown, f"Reference {ref} not found in markdown" 55 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/download/llamaparse.py: -------------------------------------------------------------------------------- ```python 1 | import io 2 | import time 3 | 4 | import requests 5 | 6 | from benchmarks.overall.download.base import Downloader 7 | 8 | 9 | class LlamaParseDownloader(Downloader): 10 | service = "llamaparse" 11 | 12 | def get_html(self, pdf_bytes): 13 | rand_name = str(time.time()) + ".pdf" 14 | start = time.time() 15 | buff = io.BytesIO(pdf_bytes) 16 | md = upload_and_parse_file(self.api_key, rand_name, buff) 17 | end = time.time() 18 | if isinstance(md, bytes): 19 | md = md.decode("utf-8") 20 | 21 | return { 22 | "md": md, 23 | "time": end - start, 24 | } 25 | 26 | 27 | def upload_and_parse_file(api_key: str, fname: str, buff, max_retries: int = 180, delay: 
int = 1): 28 | headers = { 29 | "Authorization": f"Bearer {api_key}", 30 | "Accept": "application/json" 31 | } 32 | 33 | # Upload file 34 | files = { 35 | 'file': (fname, buff, 'application/pdf') 36 | } 37 | response = requests.post( 38 | 'https://api.cloud.llamaindex.ai/api/v1/parsing/upload', 39 | headers=headers, 40 | files=files 41 | ) 42 | response.raise_for_status() 43 | job_id = response.json()['id'] 44 | 45 | # Poll for completion 46 | for _ in range(max_retries): 47 | status_response = requests.get( 48 | f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}', 49 | headers=headers 50 | ) 51 | status_response.raise_for_status() 52 | if status_response.json()['status'] == 'SUCCESS': 53 | # Get results 54 | result_response = requests.get( 55 | f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}/result/markdown', 56 | headers=headers 57 | ) 58 | result_response.raise_for_status() 59 | return result_response.json()['markdown'] 60 | 61 | time.sleep(delay) 62 | 63 | raise TimeoutError("Job did not complete within the maximum retry attempts") ``` -------------------------------------------------------------------------------- /benchmarks/overall/download/mistral.py: -------------------------------------------------------------------------------- ```python 1 | import io 2 | import time 3 | import requests 4 | 5 | from benchmarks.overall.download.base import Downloader 6 | 7 | 8 | class MistralDownloader(Downloader): 9 | service = "mistral" 10 | 11 | def get_html(self, pdf_bytes): 12 | rand_name = str(time.time()) + ".pdf" 13 | start = time.time() 14 | buff = io.BytesIO(pdf_bytes) 15 | md = upload_and_process_file(self.api_key, rand_name, buff) 16 | end = time.time() 17 | if isinstance(md, bytes): 18 | md = md.decode("utf-8") 19 | 20 | return { 21 | "md": md, 22 | "time": end - start, 23 | } 24 | 25 | 26 | def upload_and_process_file(api_key: str, fname: str, buff): 27 | headers = { 28 | "Authorization": f"Bearer {api_key}" 29 | } 30 | 31 | 
upload_headers = headers.copy() 32 | files = { 33 | 'file': (fname, buff, 'application/pdf'), 34 | 'purpose': (None, 'ocr') 35 | } 36 | 37 | upload_response = requests.post( 38 | 'https://api.mistral.ai/v1/files', 39 | headers=upload_headers, 40 | files=files 41 | ) 42 | upload_response.raise_for_status() 43 | file_id = upload_response.json()['id'] 44 | 45 | url_headers = headers.copy() 46 | url_headers["Accept"] = "application/json" 47 | 48 | url_response = requests.get( 49 | f'https://api.mistral.ai/v1/files/{file_id}/url?expiry=24', 50 | headers=url_headers 51 | ) 52 | url_response.raise_for_status() 53 | signed_url = url_response.json()['url'] 54 | 55 | ocr_headers = headers.copy() 56 | ocr_headers["Content-Type"] = "application/json" 57 | 58 | ocr_data = { 59 | "model": "mistral-ocr-latest", 60 | "document": { 61 | "type": "document_url", 62 | "document_url": signed_url 63 | }, 64 | "include_image_base64": True 65 | } 66 | ocr_response = requests.post( 67 | 'https://api.mistral.ai/v1/ocr', 68 | headers=ocr_headers, 69 | json=ocr_data 70 | ) 71 | ocr_response.raise_for_status() 72 | result = ocr_response.json() 73 | return result["pages"][0]["markdown"] ``` -------------------------------------------------------------------------------- /tests/config/test_config.py: -------------------------------------------------------------------------------- ```python 1 | import sys 2 | from contextlib import suppress 3 | import click 4 | 5 | from marker.config.printer import CustomClickPrinter 6 | from marker.config.crawler import crawler 7 | from marker.config.parser import ConfigParser 8 | 9 | 10 | def capture_kwargs(argv): 11 | command = click.command(cls=CustomClickPrinter) 12 | captured_kwargs = {} 13 | 14 | def parse_args(**kwargs): 15 | captured_kwargs.update(kwargs) 16 | return kwargs 17 | 18 | original_argv = sys.argv 19 | sys.argv = argv 20 | try: 21 | with suppress(SystemExit): 22 | command(ConfigParser.common_options(parse_args))() 23 | finally: 24 | sys.argv = 
original_argv
25 | 
26 |     return captured_kwargs
27 | 
28 | 
29 | def test_config_parser():
30 |     argv = [  # local list: don't mutate the global sys.argv (capture_kwargs saves/restores it)
31 |         "test",
32 |         "--disable_multiprocessing",
33 |         "--output_dir",
34 |         "output_dir",
35 |         "--height_tolerance",
36 |         "0.5",
37 |     ]
38 |     kwargs = capture_kwargs(argv)
39 |     parser = ConfigParser(kwargs)
40 |     config_dict = parser.generate_config_dict()
41 | 
42 |     # Validate kwarg capturing
43 |     assert kwargs["disable_multiprocessing"]
44 |     assert kwargs["output_dir"] == "output_dir"
45 | 
46 |     assert config_dict["pdftext_workers"] == 1 # disabling multiprocessing does this
47 |     assert config_dict["height_tolerance"] == 0.5
48 | 
49 | 
50 | def test_config_none():
51 |     kwargs = capture_kwargs(["test"])
52 | 
53 |     for key in crawler.attr_set:
54 |         # We force some options to become flags for ease of use on the CLI
55 |         value = None
56 |         assert kwargs.get(key) is value
57 | 
58 | 
59 | def test_config_llm():
60 |     kwargs = capture_kwargs(["test", "--use_llm"])
61 |     parser = ConfigParser(kwargs)
62 |     config_dict = parser.generate_config_dict()
63 | 
64 |     # Validate kwarg capturing
65 |     assert config_dict["use_llm"]
66 | 
67 | 
68 | def test_config_force_ocr():
69 |     kwargs = capture_kwargs(["test", "--force_ocr"])
70 |     parser = ConfigParser(kwargs)
71 |     config_dict = parser.generate_config_dict()
72 | 
73 |     # Validate kwarg capturing
74 |     assert config_dict["force_ocr"]
75 | 
```
--------------------------------------------------------------------------------
/tests/converters/test_ocr_converter.py:
--------------------------------------------------------------------------------
```python
1 | import pytest
2 | 
3 | from marker.converters.ocr import OCRConverter
4 | from marker.renderers.ocr_json import OCRJSONOutput, OCRJSONPageOutput
5 | 
6 | 
7 | def _ocr_converter(config, model_dict, temp_pdf, line_count: int, eq_count: int):
8 |     converter = OCRConverter(artifact_dict=model_dict, config=config)
9 | 
10 |     ocr_json: OCRJSONOutput = converter(temp_pdf.name)
11 |     pages = 
ocr_json.children
12 | 
13 |     assert len(pages) == 1
14 |     # assert len(pages[0].children) == line_count
15 |     eqs = [line for line in pages[0].children if line.block_type == "Equation"]
16 |     assert len(eqs) == eq_count
17 |     return pages
18 | 
19 | 
20 | def check_bboxes(page: OCRJSONPageOutput, lines):
21 |     page_size = page.bbox
22 |     for line in lines:
23 |         assert len(line.children) > 0
24 |         for child in line.children:
25 |             bbox = child.bbox
26 |             assert all(
27 |                 [
28 |                     bbox[0] >= page_size[0],
29 |                     bbox[1] >= page_size[1],
30 |                     bbox[2] <= page_size[2],
31 |                     bbox[3] <= page_size[3],
32 |                 ]
33 |             ), "Child bbox is outside page bbox"
34 | 
35 | 
36 | @pytest.mark.config({"page_range": [0]})
37 | def test_ocr_converter(config, model_dict, temp_doc):
38 |     _ocr_converter(config, model_dict, temp_doc, 85, 2)
39 | 
40 | 
41 | @pytest.mark.filename("pres.pdf")
42 | @pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True})
43 | def test_ocr_converter_force(config, model_dict, temp_doc):
44 |     pages = _ocr_converter(config, model_dict, temp_doc, 10, 0)
45 |     lines = [line for line in pages[0].children if line.block_type == "Line"]
46 |     check_bboxes(pages[0], lines)
47 | 
48 | 
49 | @pytest.mark.filename("pres.pdf")
50 | @pytest.mark.config({"page_range": [1], "keep_chars": True})
51 | def test_ocr_converter_keep(config, model_dict, temp_doc):
52 |     pages = _ocr_converter(config, model_dict, temp_doc, 10, 0)
53 |     lines = [line for line in pages[0].children if line.block_type == "Line"]
54 |     check_bboxes(pages[0], lines)
55 | 
```
--------------------------------------------------------------------------------
/benchmarks/overall/display/dataset.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | from typing import List
3 | import PIL.Image  # required by the placeholder-image fallback in build_dataset
4 | import datasets
5 | from tqdm import tqdm
6 | 
7 | from benchmarks.overall.registry import METHOD_REGISTRY
8 | from benchmarks.overall.schema import FullResult
9 | 
10 | 
11 | def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_types: List[str], max_rows: int | None = None) -> datasets.Dataset:
12 |     rows = []
13 |     for idx, sample in tqdm(enumerate(bench_dataset), desc="Building dataset"):
14 |         if idx not in result["markdown"]:
15 |             continue
16 | 
17 |         if max_rows is not None and idx >= max_rows:
18 |             break
19 | 
20 |         row = {
21 |             "uuid": sample["uuid"],
22 |             "classification": sample["classification"],
23 |             "language": sample["language"],
24 |             "img": sample["img"],
25 |         }
26 |         for method in result["markdown"][idx]:
27 |             if method == "gt":
28 |                 continue
29 | 
30 |             method_cls = METHOD_REGISTRY[method]()
31 |             md = result["markdown"][idx][method]
32 |             try:
33 |                 method_img = method_cls.render(result["markdown"][idx][method])
34 |             except Exception as e:
35 |                 # This can happen when the markdown is None
36 |                 method_img = PIL.Image.new("RGB", (200, 200))
37 | 
38 |             row[f"{method}_md"] = md
39 |             row[f"{method}_img"] = method_img
40 | 
41 |             for score_type in score_types:
42 |                 try:
43 |                     row[f"{method}_{score_type}"] = result["scores"][idx][method][score_type]["score"]
44 |                 except KeyError:
45 |                     row[f"{method}_{score_type}"] = -1.0 # Missing score
46 |                 try:
47 |                     row[f"{method}_{score_type}_detail"] = json.dumps(result["scores"][idx][method][score_type]["specific_scores"])
48 |                 except KeyError:
49 |                     row[f"{method}_{score_type}_detail"] = "" # Missing detail
50 |         rows.append(row)
51 |     ds = datasets.Dataset.from_list(rows)
52 |     return ds
53 | 
54 | 
```
--------------------------------------------------------------------------------
/benchmarks/overall/download/base.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | from json import JSONDecodeError
3 | from pathlib import Path
4 | 
5 | import datasets
6 | from tqdm import tqdm
7 | 
8 | 
9 | class Downloader:
10 |     cache_path: Path = Path("cache")
11 |     service: str
12 | 
13 |     def __init__(self, api_key, app_id, max_rows: int = 2200):
14 | self.cache_path.mkdir(exist_ok=True) 15 | self.max_rows = max_rows 16 | self.api_key = api_key 17 | self.app_id = app_id 18 | self.ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train") 19 | 20 | def get_html(self, pdf_bytes): 21 | raise NotImplementedError 22 | 23 | def upload_ds(self): 24 | rows = [] 25 | for file in self.cache_path.glob("*.json"): 26 | with open(file, "r") as f: 27 | data = json.load(f) 28 | rows.append(data) 29 | 30 | out_ds = datasets.Dataset.from_list(rows, features=datasets.Features({ 31 | "md": datasets.Value("string"), 32 | "uuid": datasets.Value("string"), 33 | "time": datasets.Value("float"), 34 | })) 35 | out_ds.push_to_hub(f"datalab-to/marker_benchmark_{self.service}", private=True) 36 | 37 | def generate_data(self): 38 | max_rows = self.max_rows 39 | for idx, sample in tqdm(enumerate(self.ds), desc=f"Saving {self.service} results"): 40 | cache_file = self.cache_path / f"{idx}.json" 41 | if cache_file.exists(): 42 | continue 43 | 44 | pdf_bytes = sample["pdf"] # This is a single page PDF 45 | try: 46 | out_data = self.get_html(pdf_bytes) 47 | except JSONDecodeError as e: 48 | print(f"Error with sample {idx}: {e}") 49 | continue 50 | except Exception as e: 51 | print(f"Error with sample {idx}: {e}") 52 | continue 53 | out_data["uuid"] = sample["uuid"] 54 | 55 | with cache_file.open("w") as f: 56 | json.dump(out_data, f) 57 | 58 | if idx >= max_rows: 59 | break 60 | 61 | def __call__(self): 62 | self.generate_data() 63 | self.upload_ds() 64 | ``` -------------------------------------------------------------------------------- /marker/processors/reference.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | 3 | from marker.processors import BaseProcessor 4 | from marker.schema import BlockTypes 5 | from marker.schema.blocks import Reference 6 | from marker.schema.document import Document 7 | from marker.schema.groups.list import ListGroup 8 | from 
marker.schema.groups.table import TableGroup 9 | from marker.schema.registry import get_block_class 10 | from marker.schema.groups.figure import FigureGroup 11 | 12 | 13 | class ReferenceProcessor(BaseProcessor): 14 | """ 15 | A processor for adding references to the document. 16 | """ 17 | 18 | def __init__(self, config): 19 | super().__init__(config) 20 | 21 | def __call__(self, document: Document): 22 | ReferenceClass: Reference = get_block_class(BlockTypes.Reference) 23 | 24 | for page in document.pages: 25 | refs = page.refs 26 | ref_starts = np.array([ref.coord for ref in refs]) 27 | 28 | blocks = [] 29 | for block_id in page.structure: 30 | block = page.get_block(block_id) 31 | if isinstance(block, (ListGroup, FigureGroup, TableGroup)): 32 | blocks.extend([page.get_block(b) for b in block.structure]) 33 | else: 34 | blocks.append(block) 35 | blocks = [b for b in blocks if not b.ignore_for_output] 36 | 37 | block_starts = np.array([block.polygon.bbox[:2] for block in blocks]) 38 | 39 | if not (len(refs) and len(block_starts)): 40 | continue 41 | 42 | distances = np.linalg.norm(block_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2) 43 | for ref_idx in range(len(ref_starts)): 44 | block_idx = np.argmin(distances[:, ref_idx]) 45 | block = blocks[block_idx] 46 | 47 | ref_block = page.add_full_block(ReferenceClass( 48 | ref=refs[ref_idx].ref, 49 | polygon=block.polygon, 50 | page_id=page.page_id 51 | )) 52 | if block.structure is None: 53 | block.structure = [] 54 | block.structure.insert(0, ref_block.id) 55 | ``` -------------------------------------------------------------------------------- /marker/builders/document.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Annotated 2 | 3 | from marker.builders import BaseBuilder 4 | from marker.builders.layout import LayoutBuilder 5 | from marker.builders.line import LineBuilder 6 | from marker.builders.ocr import OcrBuilder 7 | from 
marker.providers.pdf import PdfProvider 8 | from marker.schema import BlockTypes 9 | from marker.schema.document import Document 10 | from marker.schema.groups.page import PageGroup 11 | from marker.schema.registry import get_block_class 12 | 13 | 14 | class DocumentBuilder(BaseBuilder): 15 | """ 16 | Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder. 17 | """ 18 | lowres_image_dpi: Annotated[ 19 | int, 20 | "DPI setting for low-resolution page images used for Layout and Line Detection.", 21 | ] = 96 22 | highres_image_dpi: Annotated[ 23 | int, 24 | "DPI setting for high-resolution page images used for OCR.", 25 | ] = 192 26 | disable_ocr: Annotated[ 27 | bool, 28 | "Disable OCR processing.", 29 | ] = False 30 | 31 | def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, line_builder: LineBuilder, ocr_builder: OcrBuilder): 32 | document = self.build_document(provider) 33 | layout_builder(document, provider) 34 | line_builder(document, provider) 35 | if not self.disable_ocr: 36 | ocr_builder(document, provider) 37 | return document 38 | 39 | def build_document(self, provider: PdfProvider): 40 | PageGroupClass: PageGroup = get_block_class(BlockTypes.Page) 41 | lowres_images = provider.get_images(provider.page_range, self.lowres_image_dpi) 42 | highres_images = provider.get_images(provider.page_range, self.highres_image_dpi) 43 | initial_pages = [ 44 | PageGroupClass( 45 | page_id=p, 46 | lowres_image=lowres_images[i], 47 | highres_image=highres_images[i], 48 | polygon=provider.get_page_bbox(p), 49 | refs=provider.get_page_refs(p) 50 | ) for i, p in enumerate(provider.page_range) 51 | ] 52 | DocumentClass: Document = get_block_class(BlockTypes.Document) 53 | return DocumentClass(filepath=provider.filepath, pages=initial_pages) 54 | ``` -------------------------------------------------------------------------------- /marker/services/ollama.py: -------------------------------------------------------------------------------- 
```python
1 | import json
2 | from typing import Annotated, List
3 | 
4 | import PIL
5 | import requests
6 | from marker.logger import get_logger
7 | from pydantic import BaseModel
8 | 
9 | from marker.schema.blocks import Block
10 | from marker.services import BaseService
11 | 
12 | logger = get_logger()
13 | 
14 | 
15 | class OllamaService(BaseService):
16 |     ollama_base_url: Annotated[
17 |         str, "The base url to use for ollama. No trailing slash."
18 |     ] = "http://localhost:11434"
19 |     ollama_model: Annotated[str, "The model name to use for ollama."] = (
20 |         "llama3.2-vision"
21 |     )
22 | 
23 |     def process_images(self, images):
24 |         image_bytes = [self.img_to_base64(img) for img in images]
25 |         return image_bytes
26 | 
27 |     def __call__(
28 |         self,
29 |         prompt: str,
30 |         image: PIL.Image.Image | List[PIL.Image.Image] | None,
31 |         block: Block | None,
32 |         response_schema: type[BaseModel],
33 |         max_retries: int | None = None,
34 |         timeout: int | None = None,
35 |     ):
36 |         url = f"{self.ollama_base_url}/api/generate"
37 |         headers = {"Content-Type": "application/json"}
38 | 
39 |         schema = response_schema.model_json_schema()
40 |         format_schema = {
41 |             "type": "object",
42 |             "properties": schema["properties"],
43 |             "required": schema["required"],
44 |         }
45 | 
46 |         image_bytes = self.format_image_for_llm(image)
47 | 
48 |         payload = {
49 |             "model": self.ollama_model,
50 |             "prompt": prompt,
51 |             "stream": False,
52 |             "format": format_schema,
53 |             "images": image_bytes,
54 |         }
55 | 
56 |         try:
57 |             response = requests.post(url, json=payload, headers=headers, timeout=timeout)  # honor caller timeout; None keeps old unbounded wait
58 |             response.raise_for_status()
59 |             response_data = response.json()
60 | 
61 |             total_tokens = (
62 |                 response_data["prompt_eval_count"] + response_data["eval_count"]
63 |             )
64 | 
65 |             if block:
66 |                 block.update_metadata(llm_request_count=1, llm_tokens_used=total_tokens)
67 | 
68 |             data = response_data["response"]
69 |             return json.loads(data)
70 |         except Exception as e:
71 |             logger.warning(f"Ollama inference failed: {e}")
72 | 
73 |             return {}
74 | 
```