This is page 1 of 9. Use http://codebase.md/datalab-to/marker?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── ISSUE_TEMPLATE │ │ ├── breaking-bug-report.md │ │ ├── feature_request.md │ │ └── output-bug-report.md │ └── workflows │ ├── benchmarks.yml │ ├── ci.yml │ ├── cla.yml │ ├── publish.yml │ └── scripts.yml ├── .gitignore ├── .pre-commit-config.yaml ├── benchmarks │ ├── __init__.py │ ├── overall │ │ ├── __init__.py │ │ ├── display │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ └── table.py │ │ ├── download │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── llamaparse.py │ │ │ ├── main.py │ │ │ ├── mathpix.py │ │ │ └── mistral.py │ │ ├── elo.py │ │ ├── methods │ │ │ ├── __init__.py │ │ │ ├── docling.py │ │ │ ├── gt.py │ │ │ ├── llamaparse.py │ │ │ ├── marker.py │ │ │ ├── mathpix.py │ │ │ ├── mistral.py │ │ │ ├── olmocr.py │ │ │ └── schema.py │ │ ├── overall.py │ │ ├── registry.py │ │ ├── schema.py │ │ └── scorers │ │ ├── __init__.py │ │ ├── clean.py │ │ ├── heuristic.py │ │ ├── llm.py │ │ └── schema.py │ ├── table │ │ ├── __init__.py │ │ ├── gemini.py │ │ ├── inference.py │ │ ├── scoring.py │ │ └── table.py │ ├── throughput │ │ ├── __init__.py │ │ └── main.py │ └── verify_scores.py ├── chunk_convert.py ├── CLA.md ├── convert_single.py ├── convert.py ├── data │ ├── .gitignore │ ├── examples │ │ ├── json │ │ │ ├── multicolcnn.json │ │ │ ├── switch_trans.json │ │ │ └── thinkpython.json │ │ └── markdown │ │ ├── multicolcnn │ │ │ ├── _page_1_Figure_0.jpeg │ │ │ ├── _page_2_Picture_0.jpeg │ │ │ ├── _page_6_Figure_0.jpeg │ │ │ ├── _page_7_Figure_0.jpeg │ │ │ ├── multicolcnn_meta.json │ │ │ └── multicolcnn.md │ │ ├── switch_transformers │ │ │ ├── _page_11_Figure_4.jpeg │ │ │ ├── _page_12_Figure_4.jpeg │ │ │ ├── _page_13_Figure_2.jpeg │ │ │ ├── _page_18_Figure_1.jpeg │ │ │ ├── _page_18_Figure_3.jpeg │ │ │ ├── _page_2_Figure_3.jpeg │ │ │ ├── _page_20_Figure_1.jpeg │ │ │ ├── _page_20_Figure_4.jpeg │ │ │ ├── _page_27_Figure_1.jpeg │ 
│ │ ├── _page_29_Figure_1.jpeg │ │ │ ├── _page_30_Figure_1.jpeg │ │ │ ├── _page_31_Figure_3.jpeg │ │ │ ├── _page_4_Figure_1.jpeg │ │ │ ├── _page_5_Figure_3.jpeg │ │ │ ├── switch_trans_meta.json │ │ │ └── switch_trans.md │ │ └── thinkpython │ │ ├── _page_109_Figure_1.jpeg │ │ ├── _page_115_Figure_1.jpeg │ │ ├── _page_116_Figure_3.jpeg │ │ ├── _page_127_Figure_1.jpeg │ │ ├── _page_128_Figure_1.jpeg │ │ ├── _page_167_Figure_1.jpeg │ │ ├── _page_169_Figure_1.jpeg │ │ ├── _page_173_Figure_1.jpeg │ │ ├── _page_190_Figure_1.jpeg │ │ ├── _page_195_Figure_1.jpeg │ │ ├── _page_205_Figure_1.jpeg │ │ ├── _page_23_Figure_1.jpeg │ │ ├── _page_23_Figure_3.jpeg │ │ ├── _page_230_Figure_1.jpeg │ │ ├── _page_233_Figure_1.jpeg │ │ ├── _page_233_Figure_3.jpeg │ │ ├── _page_234_Figure_1.jpeg │ │ ├── _page_235_Figure_1.jpeg │ │ ├── _page_236_Figure_1.jpeg │ │ ├── _page_236_Figure_3.jpeg │ │ ├── _page_237_Figure_1.jpeg │ │ ├── _page_238_Figure_1.jpeg │ │ ├── _page_46_Figure_1.jpeg │ │ ├── _page_60_Figure_1.jpeg │ │ ├── _page_60_Figure_3.jpeg │ │ ├── _page_67_Figure_1.jpeg │ │ ├── _page_71_Figure_1.jpeg │ │ ├── _page_78_Figure_1.jpeg │ │ ├── _page_85_Figure_1.jpeg │ │ ├── _page_94_Figure_1.jpeg │ │ ├── _page_99_Figure_17.jpeg │ │ ├── _page_99_Figure_178.jpeg │ │ ├── thinkpython_meta.json │ │ └── thinkpython.md │ ├── images │ │ ├── overall.png │ │ ├── per_doc.png │ │ └── table.png │ └── latex_to_md.sh ├── examples │ ├── marker_modal_deployment.py │ └── README.md ├── extraction_app.py ├── LICENSE ├── marker │ ├── builders │ │ ├── __init__.py │ │ ├── document.py │ │ ├── layout.py │ │ ├── line.py │ │ ├── ocr.py │ │ └── structure.py │ ├── config │ │ ├── __init__.py │ │ ├── crawler.py │ │ ├── parser.py │ │ └── printer.py │ ├── converters │ │ ├── __init__.py │ │ ├── extraction.py │ │ ├── ocr.py │ │ ├── pdf.py │ │ └── table.py │ ├── extractors │ │ ├── __init__.py │ │ ├── document.py │ │ └── page.py │ ├── logger.py │ ├── models.py │ ├── output.py │ ├── processors │ │ ├── __init__.py │ │ ├── 
blank_page.py │ │ ├── block_relabel.py │ │ ├── blockquote.py │ │ ├── code.py │ │ ├── debug.py │ │ ├── document_toc.py │ │ ├── equation.py │ │ ├── footnote.py │ │ ├── ignoretext.py │ │ ├── line_merge.py │ │ ├── line_numbers.py │ │ ├── list.py │ │ ├── llm │ │ │ ├── __init__.py │ │ │ ├── llm_complex.py │ │ │ ├── llm_equation.py │ │ │ ├── llm_form.py │ │ │ ├── llm_handwriting.py │ │ │ ├── llm_image_description.py │ │ │ ├── llm_mathblock.py │ │ │ ├── llm_meta.py │ │ │ ├── llm_page_correction.py │ │ │ ├── llm_sectionheader.py │ │ │ ├── llm_table_merge.py │ │ │ └── llm_table.py │ │ ├── order.py │ │ ├── page_header.py │ │ ├── reference.py │ │ ├── sectionheader.py │ │ ├── table.py │ │ ├── text.py │ │ └── util.py │ ├── providers │ │ ├── __init__.py │ │ ├── document.py │ │ ├── epub.py │ │ ├── html.py │ │ ├── image.py │ │ ├── pdf.py │ │ ├── powerpoint.py │ │ ├── registry.py │ │ ├── spreadsheet.py │ │ └── utils.py │ ├── renderers │ │ ├── __init__.py │ │ ├── chunk.py │ │ ├── extraction.py │ │ ├── html.py │ │ ├── json.py │ │ ├── markdown.py │ │ └── ocr_json.py │ ├── schema │ │ ├── __init__.py │ │ ├── blocks │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── basetable.py │ │ │ ├── caption.py │ │ │ ├── code.py │ │ │ ├── complexregion.py │ │ │ ├── equation.py │ │ │ ├── figure.py │ │ │ ├── footnote.py │ │ │ ├── form.py │ │ │ ├── handwriting.py │ │ │ ├── inlinemath.py │ │ │ ├── listitem.py │ │ │ ├── pagefooter.py │ │ │ ├── pageheader.py │ │ │ ├── picture.py │ │ │ ├── reference.py │ │ │ ├── sectionheader.py │ │ │ ├── table.py │ │ │ ├── tablecell.py │ │ │ ├── text.py │ │ │ └── toc.py │ │ ├── document.py │ │ ├── groups │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── figure.py │ │ │ ├── list.py │ │ │ ├── page.py │ │ │ ├── picture.py │ │ │ └── table.py │ │ ├── polygon.py │ │ ├── registry.py │ │ └── text │ │ ├── __init__.py │ │ ├── char.py │ │ ├── line.py │ │ └── span.py │ ├── scripts │ │ ├── __init__.py │ │ ├── chunk_convert.py │ │ ├── chunk_convert.sh │ │ ├── common.py │ │ ├── 
convert_single.py │ │ ├── convert.py │ │ ├── extraction_app.py │ │ ├── file_to_s3.py │ │ ├── run_streamlit_app.py │ │ ├── server.py │ │ └── streamlit_app.py │ ├── services │ │ ├── __init__.py │ │ ├── azure_openai.py │ │ ├── claude.py │ │ ├── gemini.py │ │ ├── ollama.py │ │ ├── openai.py │ │ └── vertex.py │ ├── settings.py │ ├── util.py │ └── utils │ ├── __init__.py │ ├── batch.py │ ├── gpu.py │ └── image.py ├── marker_app.py ├── marker_server.py ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── README.md ├── signatures │ └── version1 │ └── cla.json ├── static │ └── fonts │ └── .gitignore └── tests ├── builders │ ├── test_blank_page.py │ ├── test_document_builder.py │ ├── test_garbled_pdf.py │ ├── test_layout_replace.py │ ├── test_ocr_builder.py │ ├── test_ocr_pipeline.py │ ├── test_overriding.py │ ├── test_pdf_links.py │ ├── test_rotated_bboxes.py │ ├── test_strip_existing_ocr.py │ └── test_structure.py ├── config │ └── test_config.py ├── conftest.py ├── converters │ ├── test_extraction_converter.py │ ├── test_ocr_converter.py │ ├── test_pdf_converter.py │ └── test_table_converter.py ├── processors │ ├── test_document_toc_processor.py │ ├── test_equation_processor.py │ ├── test_footnote_processor.py │ ├── test_ignoretext.py │ ├── test_llm_processors.py │ ├── test_table_merge.py │ └── test_table_processor.py ├── providers │ ├── test_document_providers.py │ ├── test_image_provider.py │ └── test_pdf_provider.py ├── renderers │ ├── test_chunk_renderer.py │ ├── test_extract_images.py │ ├── test_html_renderer.py │ ├── test_json_renderer.py │ └── test_markdown_renderer.py ├── schema │ └── groups │ └── test_list_grouping.py ├── services │ └── test_service_init.py └── utils.py ``` # Files -------------------------------------------------------------------------------- /static/fonts/.gitignore: -------------------------------------------------------------------------------- ``` 1 | * 2 | !.gitignore ``` 
-------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- ``` 1 | latex 2 | pdfs 3 | references ``` -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- ```yaml 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | # Ruff version. 4 | rev: v0.9.10 5 | hooks: 6 | # Run the linter. 7 | - id: ruff 8 | types_or: [ python, pyi ] 9 | args: [ --fix ] 10 | # Run the formatter. 11 | - id: ruff-format 12 | types_or: [ python, pyi ] ``` -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- ``` 1 | private.py 2 | .DS_Store 3 | local.env 4 | experiments 5 | test_data 6 | training 7 | wandb 8 | *.dat 9 | report.json 10 | benchmark_data 11 | debug_data 12 | temp.md 13 | temp 14 | conversion_results 15 | uploads 16 | /cache 17 | 18 | # Byte-compiled / optimized / DLL files 19 | __pycache__/ 20 | *.py[cod] 21 | *$py.class 22 | 23 | # C extensions 24 | *.so 25 | 26 | # Distribution / packaging 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | cover/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | local_settings.py 78 | db.sqlite3 79 | db.sqlite3-journal 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | .pybuilder/ 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # IPython 99 | profile_default/ 100 | ipython_config.py 101 | 102 | # pyenv 103 | # For a library or package, you might want to ignore these files since the code is 104 | # intended to run in multiple environments; otherwise, check them in: 105 | # .python-version 106 | 107 | # pipenv 108 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 109 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 110 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 111 | # install all needed dependencies. 112 | #Pipfile.lock 113 | 114 | # poetry 115 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 116 | # This is especially recommended for binary packages to ensure reproducibility, and is more 117 | # commonly ignored for libraries. 118 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 119 | #poetry.lock 120 | 121 | # pdm 122 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
123 | #pdm.lock 124 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 125 | # in version control. 126 | # https://pdm.fming.dev/#use-with-ide 127 | .pdm.toml 128 | 129 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 130 | __pypackages__/ 131 | 132 | # Celery stuff 133 | celerybeat-schedule 134 | celerybeat.pid 135 | 136 | # SageMath parsed files 137 | *.sage.py 138 | 139 | # Environments 140 | .env 141 | .venv 142 | env/ 143 | venv/ 144 | ENV/ 145 | env.bak/ 146 | venv.bak/ 147 | 148 | # Spyder project settings 149 | .spyderproject 150 | .spyproject 151 | 152 | # Rope project settings 153 | .ropeproject 154 | 155 | # mkdocs documentation 156 | /site 157 | 158 | # mypy 159 | .mypy_cache/ 160 | .dmypy.json 161 | dmypy.json 162 | 163 | # Pyre type checker 164 | .pyre/ 165 | 166 | # pytype static type analyzer 167 | .pytype/ 168 | 169 | # Cython debug symbols 170 | cython_debug/ 171 | 172 | # PyCharm 173 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 174 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 175 | # and can be added to the global gitignore or merged into this file. For a more nuclear 176 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 177 | .idea/ 178 | 179 | .vscode/ ``` -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- ```markdown 1 | ## Usage Examples 2 | 3 | This directory contains examples of running `marker` in different contexts. 
4 | 5 | ### Usage with Modal 6 | 7 | We have a [self-contained example](./marker_modal_deployment.py) that shows how you can quickly use [Modal](https://modal.com) to deploy `marker` by provisioning a container with a GPU, and expose that with an API so you can submit PDFs for conversion into Markdown, HTML, or JSON. 8 | 9 | It's a limited example that you can extend into different use cases. 10 | 11 | #### Pre-requisites 12 | 13 | Make sure you have the `modal` client installed by [following their instructions here](https://modal.com/docs/guide#getting-started). 14 | 15 | Modal's [Starter Plan](https://modal.com/pricing) includes $30 of free compute each month. 16 | Modal is [serverless](https://arxiv.org/abs/1902.03383), so you only pay for resources when you are using them. 17 | 18 | #### Running the example 19 | 20 | Once `modal` is configured, you can deploy it to your workspace by running: 21 | 22 | > modal deploy marker_modal_deployment.py 23 | 24 | Notes: 25 | - `marker` has a few models it uses. By default, the endpoint will check if these models are loaded and download them if not (first request will be slow). You can avoid this by running 26 | 27 | > modal run marker_modal_deployment.py::download_models 28 | 29 | Which will create a [`Modal Volume`](https://modal.com/docs/guide/Volumes) to store them for re-use. 30 | 31 | Once the deploy is finished, you can: 32 | - Test a file upload locally through your CLI using an `invoke_conversion` command we expose through Modal's [`local_entrypoint`](https://modal.com/docs/reference/modal.App#local_entrypoint) 33 | - Get the URL of your endpoint and make a request through a client of your choice. 
34 | 35 | **Test from your CLI with `invoke_conversion`** 36 | 37 | If your endpoint is live, simply run this command: 38 | 39 | ``` 40 | $ modal run marker_modal_deployment.py::invoke_conversion --pdf-file <PDF_FILE_PATH> --output-format markdown 41 | ``` 42 | 43 | And it'll automatically detect the URL of your new endpoint using [`.get_web_url()`](https://modal.com/docs/guide/webhook-urls#determine-the-url-of-a-web-endpoint-from-code), make sure it's healthy, submit your file, and store its output on your machine (in the same directory). 44 | 45 | **Making a request using your own client** 46 | 47 | If you want to make requests elsewhere e.g. with cURL or a client like Insomnia, you'll need to get the URL. 48 | 49 | When your `modal deploy` command from earlier finishes, it'll include your endpoint URL at the end. For example: 50 | 51 | ``` 52 | $ modal deploy marker_modal_deployment.py 53 | ... 54 | ✓ Created objects. 55 | ├── 🔨 Created mount /marker/examples/marker_modal_deployment.py 56 | ├── 🔨 Created function download_models. 57 | ├── 🔨 Created function MarkerModalDemoService.*. 58 | └── 🔨 Created web endpoint for MarkerModalDemoService.fastapi_app => <YOUR_ENDPOINT_URL> 59 | ✓ App deployed in 149.877s! 
🎉 60 | ``` 61 | 62 | If you accidentally close your terminal session, you can also always go into Modal's dashboard and: 63 | - Find the app (default name: `datalab-marker-modal-demo`) 64 | - Click on `MarkerModalDemoService` 65 | - Find your endpoint URL 66 | 67 | Once you have your URL, make a request to `{YOUR_ENDPOINT_URL}/convert` like this (you can also use Insomnia, etc.): 68 | ``` 69 | curl --request POST \ 70 | --url {BASE_URL}/convert \ 71 | --header 'Content-Type: multipart/form-data' \ 72 | --form file=@/Users/cooldev/sample.pdf \ 73 | --form output_format=html 74 | ``` 75 | 76 | You should get a response like this 77 | 78 | ``` 79 | { 80 | "success": true, 81 | "filename": "sample.pdf", 82 | "output_format": "html", 83 | "json": null, 84 | "html": "<YOUR_RESPONSE_CONTENT>", 85 | "markdown": null, 86 | "images": {}, 87 | "metadata": {... page level metadata ...}, 88 | "page_count": 2 89 | } 90 | ``` 91 | 92 | [Modal](https://modal.com) makes deploying and scaling models and inference workloads much easier. 93 | 94 | If you're interested in Datalab's managed API or on-prem document intelligence solution, check out [our platform here](https://datalab.to/?utm_source=gh-marker). 95 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- ```markdown 1 | # Marker 2 | 3 | Marker converts documents to markdown, JSON, chunks, and HTML quickly and accurately. 
4 | 5 | - Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages 6 | - Formats tables, forms, equations, inline math, links, references, and code blocks 7 | - Extracts and saves images 8 | - Removes headers/footers/other artifacts 9 | - Extensible with your own formatting and logic 10 | - Does structured extraction, given a JSON schema (beta) 11 | - Optionally boost accuracy with LLMs (and your own prompt) 12 | - Works on GPU, CPU, or MPS 13 | 14 | For our managed API or on-prem document intelligence solution, check out [our platform here](https://datalab.to?utm_source=gh-marker). 15 | 16 | ## Performance 17 | 18 | <img src="data/images/overall.png" width="800px"/> 19 | 20 | Marker benchmarks favorably compared to cloud services like Llamaparse and Mathpix, as well as other open source tools. 21 | 22 | The above results are running single PDF pages serially. Marker is significantly faster when running in batch mode, with a projected throughput of 25 pages/second on an H100. 23 | 24 | See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks. 25 | 26 | ## Hybrid Mode 27 | 28 | For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker. This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms. It can use any gemini or ollama model. By default, it uses `gemini-2.0-flash`. See [below](#llm-services) for details. 29 | 30 | Here is a table benchmark comparing marker, gemini flash alone, and marker with use_llm: 31 | 32 | <img src="data/images/table.png" width="400px"/> 33 | 34 | As you can see, the use_llm mode offers higher accuracy than marker or gemini alone. 
35 | 36 | ## Examples 37 | 38 | | PDF | File type | Markdown | JSON | 39 | |-----|-----------|------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------| 40 | | [Think Python](https://greenteapress.com/thinkpython/thinkpython.pdf) | Textbook | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/thinkpython/thinkpython.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/thinkpython.json) | 41 | | [Switch Transformers](https://arxiv.org/pdf/2101.03961.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/switch_transformers/switch_trans.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/switch_trans.json) | 42 | | [Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/multicolcnn/multicolcnn.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/multicolcnn.json) | 43 | 44 | # Commercial usage 45 | 46 | Our model weights use a modified AI Pubs Open Rail-M license (free for research, personal use, and startups under $2M funding/revenue) and our code is GPL. For broader commercial licensing or to remove GPL requirements, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-marker). 47 | 48 | # Hosted API & On-prem 49 | 50 | There's a [hosted API](https://www.datalab.to?utm_source=gh-marker) and [painless on-prem solution](https://www.datalab.to/blog/self-serve-on-prem-licensing) for marker - it's free to sign up, and we'll throw in credits for you to test it out. 
51 | 52 | The API: 53 | - Supports PDF, image, PPT, PPTX, DOC, DOCX, XLS, XLSX, HTML, EPUB files 54 | - Is 1/4th the price of leading cloud-based competitors 55 | - Fast - ~15s for a 250 page PDF 56 | - Supports LLM mode 57 | - High uptime (99.99%) 58 | 59 | # Community 60 | 61 | [Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development. 62 | 63 | # Installation 64 | 65 | You'll need python 3.10+ and [PyTorch](https://pytorch.org/get-started/locally/). 66 | 67 | Install with: 68 | 69 | ```shell 70 | pip install marker-pdf 71 | ``` 72 | 73 | If you want to use marker on documents other than PDFs, you will need to install additional dependencies with: 74 | 75 | ```shell 76 | pip install marker-pdf[full] 77 | ``` 78 | 79 | # Usage 80 | 81 | First, some configuration: 82 | 83 | - Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`. 84 | - Some PDFs, even digital ones, have bad text in them. Set `--force_ocr` to force OCR on all lines, or `--strip_existing_ocr` to keep all digital text, and strip out any existing OCR text. 85 | - If you care about inline math, set `force_ocr` to convert inline math to LaTeX. 86 | 87 | ## Interactive App 88 | 89 | I've included a streamlit app that lets you interactively try marker with some basic options. Run it with: 90 | 91 | ```shell 92 | pip install streamlit streamlit-ace 93 | marker_gui 94 | ``` 95 | 96 | ## Convert a single file 97 | 98 | ```shell 99 | marker_single /path/to/file.pdf 100 | ``` 101 | 102 | You can pass in PDFs or images. 103 | 104 | Options: 105 | - `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20. 106 | - `--output_format [markdown|json|html|chunks]`: Specify the format for the output results. 107 | - `--output_dir PATH`: Directory where output files will be saved.
Defaults to the value specified in settings.OUTPUT_DIR. 108 | - `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n` 109 | - `--use_llm`: Uses an LLM to improve accuracy. You will need to configure the LLM backend - see [below](#llm-services). 110 | - `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text. This will also format inline math properly. 111 | - `--block_correction_prompt`: if LLM mode is active, an optional prompt that will be used to correct the output of marker. This is useful for custom formatting or logic that you want to apply to the output. 112 | - `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya. 113 | - `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`. 114 | - `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description. 115 | - `--debug`: Enable debug mode for additional logging and diagnostic information. 116 | - `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"` 117 | - `--config_json PATH`: Path to a JSON configuration file containing additional settings. 118 | - `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults. 119 | - `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`. The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables. 120 | - `--llm_service`: Which llm service to use if `--use_llm` is passed. 
This defaults to `marker.services.gemini.GoogleGeminiService`. 121 | - `--help`: see all of the flags that can be passed into marker. (it supports many more options than are listed above) 122 | 123 | The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py). If you don't need OCR, marker can work with any language. 124 | 125 | ## Convert multiple files 126 | 127 | ```shell 128 | marker /path/to/input/folder 129 | ``` 130 | 131 | - `marker` supports all the same options from `marker_single` above. 132 | - `--workers` is the number of conversion workers to run simultaneously. This is automatically set by default, but you can increase it to increase throughput, at the cost of more CPU/GPU usage. Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average. 133 | 134 | ## Convert multiple files on multiple GPUs 135 | 136 | ```shell 137 | NUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert ../pdf_in ../md_out 138 | ``` 139 | 140 | - `NUM_DEVICES` is the number of GPUs to use. Should be `2` or greater. 141 | - `NUM_WORKERS` is the number of parallel processes to run on each GPU. 142 | 143 | ## Use from python 144 | 145 | See the `PdfConverter` class at `marker/converters/pdf.py` for additional arguments that can be passed. 146 | 147 | ```python 148 | from marker.converters.pdf import PdfConverter 149 | from marker.models import create_model_dict 150 | from marker.output import text_from_rendered 151 | 152 | converter = PdfConverter( 153 | artifact_dict=create_model_dict(), 154 | ) 155 | rendered = converter("FILEPATH") 156 | text, _, images = text_from_rendered(rendered) 157 | ``` 158 | 159 | `rendered` will be a pydantic basemodel with different properties depending on the output type requested. With markdown output (default), you'll have the properties `markdown`, `metadata`, and `images`. For json output, you'll have `children`, `block_type`, and `metadata`.
160 | 161 | ### Custom configuration 162 | 163 | You can pass configuration using the `ConfigParser`. To see all available options, do `marker_single --help`. 164 | 165 | ```python 166 | from marker.converters.pdf import PdfConverter 167 | from marker.models import create_model_dict 168 | from marker.config.parser import ConfigParser 169 | 170 | config = { 171 | "output_format": "json", 172 | "ADDITIONAL_KEY": "VALUE" 173 | } 174 | config_parser = ConfigParser(config) 175 | 176 | converter = PdfConverter( 177 | config=config_parser.generate_config_dict(), 178 | artifact_dict=create_model_dict(), 179 | processor_list=config_parser.get_processors(), 180 | renderer=config_parser.get_renderer(), 181 | llm_service=config_parser.get_llm_service() 182 | ) 183 | rendered = converter("FILEPATH") 184 | ``` 185 | 186 | ### Extract blocks 187 | 188 | Each document consists of one or more pages. Pages contain blocks, which can themselves contain other blocks. It's possible to programmatically manipulate these blocks. 189 | 190 | Here's an example of extracting all forms from a document: 191 | 192 | ```python 193 | from marker.converters.pdf import PdfConverter 194 | from marker.models import create_model_dict 195 | from marker.schema import BlockTypes 196 | 197 | converter = PdfConverter( 198 | artifact_dict=create_model_dict(), 199 | ) 200 | document = converter.build_document("FILEPATH") 201 | forms = document.contained_blocks((BlockTypes.Form,)) 202 | ``` 203 | 204 | Look at the processors for more examples of extracting and manipulating blocks. 
205 | 206 | ## Other converters 207 | 208 | You can also use other converters that define different conversion pipelines: 209 | 210 | ### Extract tables 211 | 212 | The `TableConverter` will only convert and extract tables: 213 | 214 | ```python 215 | from marker.converters.table import TableConverter 216 | from marker.models import create_model_dict 217 | from marker.output import text_from_rendered 218 | 219 | converter = TableConverter( 220 | artifact_dict=create_model_dict(), 221 | ) 222 | rendered = converter("FILEPATH") 223 | text, _, images = text_from_rendered(rendered) 224 | ``` 225 | 226 | This takes all the same configuration as the PdfConverter. You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table. Set `output_format=json` to also get cell bounding boxes. 227 | 228 | You can also run this via the CLI with 229 | ```shell 230 | marker_single FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter --output_format json 231 | ``` 232 | 233 | ### OCR Only 234 | 235 | If you only want to run OCR, you can also do that through the `OCRConverter`. Set `--keep_chars` to keep individual characters and bounding boxes. 236 | 237 | ```python 238 | from marker.converters.ocr import OCRConverter 239 | from marker.models import create_model_dict 240 | 241 | converter = OCRConverter( 242 | artifact_dict=create_model_dict(), 243 | ) 244 | rendered = converter("FILEPATH") 245 | ``` 246 | 247 | This takes all the same configuration as the PdfConverter. 248 | 249 | You can also run this via the CLI with 250 | ```shell 251 | marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter 252 | ``` 253 | 254 | ### Structured Extraction (beta) 255 | 256 | You can run structured extraction via the `ExtractionConverter`. This requires an llm service to be setup first (see [here](#llm-services) for details). 
You'll get a JSON output with the extracted values. 257 | 258 | ```python 259 | from marker.converters.extraction import ExtractionConverter 260 | from marker.models import create_model_dict 261 | from marker.config.parser import ConfigParser 262 | from pydantic import BaseModel 263 | 264 | class Links(BaseModel): 265 | links: list[str] 266 | 267 | schema = Links.model_json_schema() 268 | config_parser = ConfigParser({ 269 | "page_schema": schema 270 | }) 271 | 272 | converter = ExtractionConverter( 273 | artifact_dict=create_model_dict(), 274 | config=config_parser.generate_config_dict(), 275 | llm_service=config_parser.get_llm_service(), 276 | ) 277 | rendered = converter("FILEPATH") 278 | ``` 279 | 280 | Rendered will have an `original_markdown` field. If you pass this back in next time you run the converter, as the `existing_markdown` config key, you can skip re-parsing the document. 281 | 282 | # Output Formats 283 | 284 | ## Markdown 285 | 286 | Markdown output will include: 287 | 288 | - image links (images will be saved in the same folder) 289 | - formatted tables 290 | - embedded LaTeX equations (fenced with `$$`) 291 | - Code is fenced with triple backticks 292 | - Superscripts for footnotes 293 | 294 | ## HTML 295 | 296 | HTML output is similar to markdown output: 297 | 298 | - Images are included via `img` tags 299 | - equations are fenced with `<math>` tags 300 | - code is in `pre` tags 301 | 302 | ## JSON 303 | 304 | JSON output will be organized in a tree-like structure, with the leaf nodes being blocks. Examples of leaf nodes are a single list item, a paragraph of text, or an image. 305 | 306 | The output will be a list, with each list item representing a page. Each page is considered a block in the internal marker schema. There are different types of blocks to represent different elements. 307 | 308 | Pages have the keys: 309 | 310 | - `id` - unique id for the block. 311 | - `block_type` - the type of block. 
The possible block types can be seen in `marker/schema/__init__.py`. As of this writing, they are ["Line", "Span", "FigureGroup", "TableGroup", "ListGroup", "PictureGroup", "Page", "Caption", "Code", "Figure", "Footnote", "Form", "Equation", "Handwriting", "TextInlineMath", "ListItem", "PageFooter", "PageHeader", "Picture", "SectionHeader", "Table", "Text", "TableOfContents", "Document"] 312 | - `html` - the HTML for the page. Note that this will have recursive references to children. The `content-ref` tags must be replaced with the child content if you want the full html. You can see an example of this at `marker/output.py:json_to_html`. That function will take in a single block from the json output, and turn it into HTML. 313 | - `polygon` - the 4-corner polygon of the page, in (x1,y1), (x2,y2), (x3, y3), (x4, y4) format. (x1,y1) is the top left, and coordinates go clockwise. 314 | - `children` - the child blocks. 315 | 316 | The child blocks have two additional keys: 317 | 318 | - `section_hierarchy` - indicates the sections that the block is part of. `1` indicates an h1 tag, `2` an h2, and so on. 319 | - `images` - base64 encoded images. The key will be the block id, and the data will be the encoded image. 320 | 321 | Note that child blocks of pages can have their own children as well (a tree structure). 
322 | 323 | ```json 324 | { 325 | "id": "/page/10/Page/366", 326 | "block_type": "Page", 327 | "html": "<content-ref src='/page/10/SectionHeader/0'></content-ref><content-ref src='/page/10/SectionHeader/1'></content-ref><content-ref src='/page/10/Text/2'></content-ref><content-ref src='/page/10/Text/3'></content-ref><content-ref src='/page/10/Figure/4'></content-ref><content-ref src='/page/10/SectionHeader/5'></content-ref><content-ref src='/page/10/SectionHeader/6'></content-ref><content-ref src='/page/10/TextInlineMath/7'></content-ref><content-ref src='/page/10/TextInlineMath/8'></content-ref><content-ref src='/page/10/Table/9'></content-ref><content-ref src='/page/10/SectionHeader/10'></content-ref><content-ref src='/page/10/Text/11'></content-ref>", 328 | "polygon": [[0.0, 0.0], [612.0, 0.0], [612.0, 792.0], [0.0, 792.0]], 329 | "children": [ 330 | { 331 | "id": "/page/10/SectionHeader/0", 332 | "block_type": "SectionHeader", 333 | "html": "<h1>Supplementary Material for <i>Subspace Adversarial Training</i> </h1>", 334 | "polygon": [ 335 | [217.845703125, 80.630859375], [374.73046875, 80.630859375], 336 | [374.73046875, 107.0], 337 | [217.845703125, 107.0] 338 | ], 339 | "children": null, 340 | "section_hierarchy": { 341 | "1": "/page/10/SectionHeader/1" 342 | }, 343 | "images": {} 344 | }, 345 | ... 346 | ] 347 | } 348 | 349 | 350 | ``` 351 | 352 | ## Chunks 353 | 354 | Chunks format is similar to JSON, but flattens everything into a single list instead of a tree. Only the top level blocks from each page show up. It also has the full HTML of each block inside, so you don't need to crawl the tree to reconstruct it. This enables flexible and easy chunking for RAG. 355 | 356 | ## Metadata 357 | 358 | All output formats will return a metadata dictionary, with the following fields: 359 | 360 | ```json 361 | { 362 | "table_of_contents": [ 363 | { 364 | "title": "Introduction", 365 | "heading_level": 1, 366 | "page_id": 0, 367 | "polygon": [...] 
368 | } 369 | ], // computed PDF table of contents 370 | "page_stats": [ 371 | { 372 | "page_id": 0, 373 | "text_extraction_method": "pdftext", 374 | "block_counts": [("Span", 200), ...] 375 | }, 376 | ... 377 | ] 378 | } 379 | ``` 380 | 381 | # LLM Services 382 | 383 | When running with the `--use_llm` flag, you have a choice of services you can use: 384 | 385 | - `Gemini` - this will use the Gemini developer API by default. You'll need to pass `--gemini_api_key` to configuration. 386 | - `Google Vertex` - this will use vertex, which can be more reliable. You'll need to pass `--vertex_project_id`. To use it, set `--llm_service=marker.services.vertex.GoogleVertexService`. 387 | - `Ollama` - this will use local models. You can configure `--ollama_base_url` and `--ollama_model`. To use it, set `--llm_service=marker.services.ollama.OllamaService`. 388 | - `Claude` - this will use the anthropic API. You can configure `--claude_api_key`, and `--claude_model_name`. To use it, set `--llm_service=marker.services.claude.ClaudeService`. 389 | - `OpenAI` - this supports any openai-like endpoint. You can configure `--openai_api_key`, `--openai_model`, and `--openai_base_url`. To use it, set `--llm_service=marker.services.openai.OpenAIService`. 390 | - `Azure OpenAI` - this uses the Azure OpenAI service. You can configure `--azure_endpoint`, `--azure_api_key`, and `--deployment_name`. To use it, set `--llm_service=marker.services.azure_openai.AzureOpenAIService`. 391 | 392 | These services may have additional optional configuration as well - you can see it by viewing the classes. 393 | 394 | # Internals 395 | 396 | Marker is easy to extend. The core units of marker are: 397 | 398 | - `Providers`, at `marker/providers`. These provide information from a source file, like a PDF. 399 | - `Builders`, at `marker/builders`. These generate the initial document blocks and fill in text, using info from the providers. 400 | - `Processors`, at `marker/processors`. 
These process specific blocks, for example the table formatter is a processor. 401 | - `Renderers`, at `marker/renderers`. These use the blocks to render output. 402 | - `Schema`, at `marker/schema`. The classes for all the block types. 403 | - `Converters`, at `marker/converters`. They run the whole end to end pipeline. 404 | 405 | To customize processing behavior, override the `processors`. To add new output formats, write a new `renderer`. For additional input formats, write a new `provider`. 406 | 407 | Processors and renderers can be directly passed into the base `PdfConverter`, so you can specify your own custom processing easily. 408 | 409 | ## API server 410 | 411 | There is a very simple API server you can run like this: 412 | 413 | ```shell 414 | pip install -U uvicorn fastapi python-multipart 415 | marker_server --port 8001 416 | ``` 417 | 418 | This will start a fastapi server that you can access at `localhost:8001`. You can go to `localhost:8001/docs` to see the endpoint options. 419 | 420 | You can send requests like this: 421 | 422 | ``` 423 | import requests 424 | import json 425 | 426 | post_data = { 427 | 'filepath': 'FILEPATH', 428 | # Add other params here 429 | } 430 | 431 | requests.post("http://localhost:8001/marker", data=json.dumps(post_data)).json() 432 | ``` 433 | 434 | Note that this is not a very robust API, and is only intended for small-scale use. If you want to use this server, but want a more robust conversion option, you can use the hosted [Datalab API](https://www.datalab.to/plans). 435 | 436 | # Troubleshooting 437 | 438 | There are some settings that you may find useful if things aren't working the way you expect: 439 | 440 | - If you have issues with accuracy, try setting `--use_llm` to use an LLM to improve quality. You must set `GOOGLE_API_KEY` to a Gemini API key for this to work. 441 | - Make sure to set `force_ocr` if you see garbled text - this will re-OCR the document. 
442 | - `TORCH_DEVICE` - set this to force marker to use a given torch device for inference. 443 | - If you're getting out of memory errors, decrease worker count. You can also try splitting up long PDFs into multiple files. 444 | 445 | ## Debugging 446 | 447 | Pass the `debug` option to activate debug mode. This will save images of each page with detected layout and text, as well as output a json file with additional bounding box information. 448 | 449 | # Benchmarks 450 | 451 | ## Overall PDF Conversion 452 | 453 | We created a [benchmark set](https://huggingface.co/datasets/datalab-to/marker_benchmark) by extracting single PDF pages from common crawl. We scored based on a heuristic that aligns text with ground truth text segments, and an LLM as a judge scoring method. 454 | 455 | | Method | Avg Time | Heuristic Score | LLM Score | 456 | |------------|----------|-----------------|-----------| 457 | | marker | 2.83837 | 95.6709 | 4.23916 | 458 | | llamaparse | 23.348 | 84.2442 | 3.97619 | 459 | | mathpix | 6.36223 | 86.4281 | 4.15626 | 460 | | docling | 3.69949 | 86.7073 | 3.70429 | 461 | 462 | Benchmarks were run on an H100 for marker and docling - llamaparse and mathpix used their cloud services. 
We can also look at it by document type: 463 | 464 | <img src="data/images/per_doc.png" width="1000px"/> 465 | 466 | | Document Type | Marker heuristic | Marker LLM | Llamaparse Heuristic | Llamaparse LLM | Mathpix Heuristic | Mathpix LLM | Docling Heuristic | Docling LLM | 467 | |----------------------|------------------|------------|----------------------|----------------|-------------------|-------------|-------------------|-------------| 468 | | Scientific paper | 96.6737 | 4.34899 | 87.1651 | 3.96421 | 91.2267 | 4.46861 | 92.135 | 3.72422 | 469 | | Book page | 97.1846 | 4.16168 | 90.9532 | 4.07186 | 93.8886 | 4.35329 | 90.0556 | 3.64671 | 470 | | Other | 95.1632 | 4.25076 | 81.1385 | 4.01835 | 79.6231 | 4.00306 | 83.8223 | 3.76147 | 471 | | Form | 88.0147 | 3.84663 | 66.3081 | 3.68712 | 64.7512 | 3.33129 | 68.3857 | 3.40491 | 472 | | Presentation | 95.1562 | 4.13669 | 81.2261 | 4 | 83.6737 | 3.95683 | 84.8405 | 3.86331 | 473 | | Financial document | 95.3697 | 4.39106 | 82.5812 | 4.16111 | 81.3115 | 4.05556 | 86.3882 | 3.8 | 474 | | Letter | 98.4021 | 4.5 | 93.4477 | 4.28125 | 96.0383 | 4.45312 | 92.0952 | 4.09375 | 475 | | Engineering document | 93.9244 | 4.04412 | 77.4854 | 3.72059 | 80.3319 | 3.88235 | 79.6807 | 3.42647 | 476 | | Legal document | 96.689 | 4.27759 | 86.9769 | 3.87584 | 91.601 | 4.20805 | 87.8383 | 3.65552 | 477 | | Newspaper page | 98.8733 | 4.25806 | 84.7492 | 3.90323 | 96.9963 | 4.45161 | 92.6496 | 3.51613 | 478 | | Magazine page | 98.2145 | 4.38776 | 87.2902 | 3.97959 | 93.5934 | 4.16327 | 93.0892 | 4.02041 | 479 | 480 | ## Throughput 481 | 482 | We benchmarked throughput using a [single long PDF](https://www.greenteapress.com/thinkpython/thinkpython.pdf). 
483 | 484 | | Method | Time per page | Time per document | VRAM used | 485 | |---------|---------------|-------------------|---------- | 486 | | marker | 0.18 | 43.42 | 3.17GB | 487 | 488 | The projected throughput is 122 pages per second on an H100 - we can run 22 individual processes given the VRAM used. 489 | 490 | ## Table Conversion 491 | 492 | Marker can extract tables from PDFs using `marker.converters.table.TableConverter`. The table extraction performance is measured by comparing the extracted HTML representation of tables against the original HTML representations using the test split of [FinTabNet](https://developer.ibm.com/exchanges/data/all/fintabnet/). The HTML representations are compared using a tree edit distance based metric to judge both structure and content. Marker detects and identifies the structure of all tables in a PDF page and achieves these scores: 493 | 494 | | Method | Avg score | Total tables | 495 | |------------------|-----------|--------------| 496 | | marker | 0.816 | 99 | 497 | | marker w/use_llm | 0.907 | 99 | 498 | | gemini | 0.829 | 99 | 499 | 500 | The `--use_llm` flag can significantly improve table recognition performance, as you can see. 501 | 502 | We filter out tables that we cannot align with the ground truth, since fintabnet and our layout model have slightly different detection methods (this results in some tables being split/merged). 503 | 504 | ## Running your own benchmarks 505 | 506 | You can benchmark the performance of marker on your machine. Install marker manually with: 507 | 508 | ```shell 509 | git clone https://github.com/VikParuchuri/marker.git 510 | poetry install 511 | ``` 512 | 513 | ### Overall PDF Conversion 514 | 515 | Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. 
Then run the overall benchmark like this: 516 | 517 | ```shell 518 | python benchmarks/overall.py --methods marker --scores heuristic,llm 519 | ``` 520 | 521 | Options: 522 | 523 | - `--use_llm` use an llm to improve the marker results. 524 | - `--max_rows` how many rows to process for the benchmark. 525 | - `--methods` can be `llamaparse`, `mathpix`, `docling`, `marker`. Comma separated. 526 | - `--scores` which scoring functions to use, can be `llm`, `heuristic`. Comma separated. 527 | 528 | ### Table Conversion 529 | The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with: 530 | 531 | ```shell 532 | python benchmarks/table/table.py --max_rows 100 533 | ``` 534 | 535 | Options: 536 | 537 | - `--use_llm` uses an llm with marker to improve accuracy. 538 | - `--use_gemini` also benchmarks gemini 2.0 flash. 539 | 540 | # How it works 541 | 542 | Marker is a pipeline of deep learning models: 543 | 544 | - Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya)) 545 | - Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya)) 546 | - Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify), [surya](https://github.com/VikParuchuri/surya)) 547 | - Optionally use an LLM to improve quality 548 | - Combine blocks and postprocess complete text 549 | 550 | It only uses models where necessary, which improves speed and accuracy. 551 | 552 | # Limitations 553 | 554 | PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address: 555 | 556 | - Very complex layouts, with nested tables and forms, may not work 557 | - Forms may not be rendered well 558 | 559 | Note: Passing the `--use_llm` and `--force_ocr` flags will mostly solve these issues. 
560 | 561 | # Usage and Deployment Examples 562 | 563 | You can always run `marker` locally, but if you wanted to expose it as an API, we have a few options: 564 | - Our platform API which is powered by `marker` and `surya` and is easy to test out - it's free to sign up, and we'll include credits, [try it out here](https://datalab.to) 565 | - Our painless on-prem solution for commercial use, which you can [read about here](https://www.datalab.to/blog/self-serve-on-prem-licensing) and gives you privacy guarantees with high throughput inference optimizations. 566 | - [Deployment example with Modal](./examples/README_MODAL.md) that shows you how to deploy and access `marker` through a web endpoint using [`Modal`](https://modal.com). Modal is an AI compute platform that enables developers to deploy and scale models on GPUs in minutes. 567 | ``` -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/display/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/download/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /benchmarks/table/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` 
-------------------------------------------------------------------------------- /benchmarks/throughput/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /marker/config/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /marker/scripts/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /marker/utils/__init__.py: -------------------------------------------------------------------------------- ```python 1 | ``` -------------------------------------------------------------------------------- /marker/schema/groups/base.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema.blocks import Block 2 | 3 | 4 | class Group(Block): 5 | pass ``` -------------------------------------------------------------------------------- /marker/schema/text/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema.text.line import Line 2 | from marker.schema.text.span import Span 3 | ``` -------------------------------------------------------------------------------- /marker_server.py: -------------------------------------------------------------------------------- ```python 1 | from marker.scripts.server import server_cli 2 | 3 | if __name__ == "__main__": 4 | server_cli() 5 | ``` -------------------------------------------------------------------------------- /convert.py: -------------------------------------------------------------------------------- ```python 1 | from marker.scripts.convert import convert_cli 2 | 3 | if __name__ == 
"__main__": 4 | convert_cli() 5 | ``` -------------------------------------------------------------------------------- /chunk_convert.py: -------------------------------------------------------------------------------- ```python 1 | from marker.scripts.chunk_convert import chunk_convert_cli 2 | 3 | if __name__ == "__main__": 4 | chunk_convert_cli() ``` -------------------------------------------------------------------------------- /convert_single.py: -------------------------------------------------------------------------------- ```python 1 | from marker.scripts.convert_single import convert_single_cli 2 | 3 | if __name__ == "__main__": 4 | convert_single_cli() 5 | ``` -------------------------------------------------------------------------------- /marker_app.py: -------------------------------------------------------------------------------- ```python 1 | from marker.scripts.run_streamlit_app import streamlit_app_cli 2 | 3 | if __name__ == "__main__": 4 | streamlit_app_cli() ``` -------------------------------------------------------------------------------- /extraction_app.py: -------------------------------------------------------------------------------- ```python 1 | from marker.scripts.run_streamlit_app import extraction_app_cli 2 | 3 | if __name__ == "__main__": 4 | extraction_app_cli() 5 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/schema.py: -------------------------------------------------------------------------------- ```python 1 | from typing import TypedDict, List 2 | 3 | 4 | class BenchmarkResult(TypedDict): 5 | markdown: str | List[str] 6 | time: float | None ``` -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- ``` 1 | [pytest] 2 | testpaths=tests 3 | markers = 4 | filename(name): specify the filename for the pdf_document fixture 5 | filterwarnings = 6 | 
ignore::Warning ``` -------------------------------------------------------------------------------- /benchmarks/overall/scorers/schema.py: -------------------------------------------------------------------------------- ```python 1 | from typing import TypedDict, List, Optional, Dict 2 | 3 | 4 | class BlockScores(TypedDict): 5 | score: float 6 | specific_scores: Dict[str, float | List[float]] 7 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/toc.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks.basetable import BaseTable 3 | 4 | 5 | class TableOfContents(BaseTable): 6 | block_type: str = BlockTypes.TableOfContents 7 | block_description: str = "A table of contents." 8 | ``` -------------------------------------------------------------------------------- /marker/schema/text/char.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Char(Block): 6 | block_type: BlockTypes = BlockTypes.Char 7 | block_description: str = "A single character inside a span." 
8 | 9 | text: str 10 | idx: int 11 | ``` -------------------------------------------------------------------------------- /marker/providers/utils.py: -------------------------------------------------------------------------------- ```python 1 | def alphanum_ratio(text): 2 | text = text.replace(" ", "") 3 | text = text.replace("\n", "") 4 | alphanumeric_count = sum([1 for c in text if c.isalnum()]) 5 | 6 | if len(text) == 0: 7 | return 1 8 | 9 | ratio = alphanumeric_count / len(text) 10 | return ratio 11 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/table.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks.basetable import BaseTable 3 | 4 | 5 | class Table(BaseTable): 6 | block_type: BlockTypes = BlockTypes.Table 7 | block_description: str = "A table of data, like a results table. It will be in a tabular format." 8 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/scorers/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | from benchmarks.overall.scorers.schema import BlockScores 4 | 5 | 6 | class BaseScorer: 7 | def __init__(self): 8 | pass 9 | 10 | def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores: 11 | raise NotImplementedError() ``` -------------------------------------------------------------------------------- /tests/builders/test_structure.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.builders.structure import StructureBuilder 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | def test_structure_builder(pdf_document): 8 | structure = StructureBuilder() 9 | structure(pdf_document) 10 | assert 
len(pdf_document.pages[0].structure) > 0 11 | ``` -------------------------------------------------------------------------------- /marker/schema/groups/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema.blocks.base import Block 2 | from marker.schema.groups.figure import FigureGroup 3 | from marker.schema.groups.table import TableGroup 4 | from marker.schema.groups.list import ListGroup 5 | from marker.schema.groups.picture import PictureGroup 6 | from marker.schema.groups.page import PageGroup 7 | ``` -------------------------------------------------------------------------------- /marker/builders/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Optional 2 | 3 | from pydantic import BaseModel 4 | 5 | from marker.util import assign_config 6 | 7 | 8 | class BaseBuilder: 9 | def __init__(self, config: Optional[BaseModel | dict] = None): 10 | assign_config(self, config) 11 | 12 | def __call__(self, data, *args, **kwargs): 13 | raise NotImplementedError 14 | ``` -------------------------------------------------------------------------------- /tests/builders/test_ocr_builder.py: -------------------------------------------------------------------------------- ```python 1 | from PIL import Image 2 | 3 | from marker.builders.ocr import OcrBuilder 4 | 5 | 6 | def test_blank_char_builder(recognition_model): 7 | builder = OcrBuilder(recognition_model) 8 | image = Image.new("RGB", (100, 100)) 9 | spans = builder.spans_from_html_chars([], None, image) # Test with empty char list 10 | assert len(spans) == 0 11 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/form.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.blocks.basetable 
import BaseTable 5 | 6 | 7 | class Form(BaseTable): 8 | block_type: BlockTypes = BlockTypes.Form 9 | block_description: str = "A form, such as a tax form, that contains fields and labels. It most likely doesn't have a table structure." 10 | ``` -------------------------------------------------------------------------------- /tests/renderers/test_json_renderer.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.renderers.json import JSONRenderer 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | def test_markdown_renderer_pagination(pdf_document): 8 | renderer = JSONRenderer() 9 | pages = renderer(pdf_document).children 10 | 11 | assert len(pages) == 1 12 | assert pages[0].block_type == "Page" 13 | assert pages[0].children[0].block_type == "SectionHeader" ``` -------------------------------------------------------------------------------- /benchmarks/overall/schema.py: -------------------------------------------------------------------------------- ```python 1 | from typing import TypedDict, List, Dict 2 | 3 | from benchmarks.overall.scorers.schema import BlockScores 4 | 5 | AVG_TYPE = Dict[str, Dict[str, Dict[str, List[float]]]] 6 | 7 | class FullResult(TypedDict): 8 | scores: Dict[int, Dict[str, Dict[str, BlockScores]]] 9 | averages_by_type: AVG_TYPE 10 | averages_by_block_type: AVG_TYPE 11 | average_times: Dict[str, List[float]] 12 | markdown: Dict[int, Dict[str, str]] 13 | ``` -------------------------------------------------------------------------------- /tests/builders/test_strip_existing_ocr.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | 4 | @pytest.mark.config({"page_range": [0], "strip_existing_ocr": True}) 5 | @pytest.mark.filename("handwritten.pdf") 6 | def test_strip_ocr(doc_provider): 7 | # Ensure that the OCR text isn't extracted 8 | assert len(doc_provider.page_lines) == 0 9 | 10 | 11 | 
@pytest.mark.config({"page_range": [0]}) 12 | @pytest.mark.filename("handwritten.pdf") 13 | def test_keep_ocr(doc_provider): 14 | assert len(doc_provider.page_lines) == 1 15 | ``` -------------------------------------------------------------------------------- /tests/renderers/test_html_renderer.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.renderers.html import HTMLRenderer 4 | 5 | 6 | @pytest.mark.config( 7 | { 8 | "page_range": [0], 9 | "disable_ocr": True, 10 | "add_block_ids": True, 11 | "paginate_output": True, 12 | } 13 | ) 14 | def test_html_renderer_block_ids(pdf_document, config): 15 | renderer = HTMLRenderer(config) 16 | html = renderer(pdf_document).html 17 | 18 | # Verify some block IDs are present 19 | assert "/page/0/Text/1" in html 20 | ``` -------------------------------------------------------------------------------- /tests/processors/test_document_toc_processor.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.processors.document_toc import DocumentTOCProcessor 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | def test_document_toc_processor(pdf_document, detection_model, recognition_model, table_rec_model): 8 | processor = DocumentTOCProcessor() 9 | processor(pdf_document) 10 | 11 | assert len(pdf_document.table_of_contents) == 4 12 | assert pdf_document.table_of_contents[0]["title"] == "Subspace Adversarial Training" 13 | ``` -------------------------------------------------------------------------------- /tests/processors/test_equation_processor.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | from marker.processors.equation import EquationProcessor 5 | 6 | 7 | @pytest.mark.config({"page_range": [0]}) 8 | def test_equation_processor(pdf_document, 
recognition_model): 9 | processor = EquationProcessor(recognition_model) 10 | processor(pdf_document) 11 | 12 | for block in pdf_document.pages[0].children: 13 | if block.block_type == BlockTypes.Equation: 14 | assert block.html is not None ``` -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- ```python 1 | from marker.providers.pdf import PdfProvider 2 | import tempfile 3 | 4 | import datasets 5 | 6 | 7 | def setup_pdf_provider( 8 | filename='adversarial.pdf', 9 | config=None, 10 | ) -> PdfProvider: 11 | dataset = datasets.load_dataset("datalab-to/pdfs", split="train") 12 | idx = dataset['filename'].index(filename) 13 | 14 | temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf") 15 | temp_pdf.write(dataset['pdf'][idx]) 16 | temp_pdf.flush() 17 | 18 | provider = PdfProvider(temp_pdf.name, config) 19 | return provider 20 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/code.py: -------------------------------------------------------------------------------- ```python 1 | import html 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.blocks import Block 5 | 6 | 7 | class Code(Block): 8 | block_type: BlockTypes = BlockTypes.Code 9 | code: str | None = None 10 | html: str | None = None 11 | block_description: str = "A programming code block." 
12 | 13 | def assemble_html(self, document, child_blocks, parent_structure, block_config): 14 | if self.html: 15 | return self.html 16 | code = self.code or "" 17 | return f"<pre>{html.escape(code)}</pre>" 18 | ``` -------------------------------------------------------------------------------- /marker/processors/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Optional, Tuple 2 | 3 | from pydantic import BaseModel 4 | 5 | from marker.schema import BlockTypes 6 | from marker.schema.document import Document 7 | from marker.util import assign_config 8 | 9 | 10 | class BaseProcessor: 11 | block_types: Tuple[BlockTypes] | None = None # What block types this processor is responsible for 12 | 13 | def __init__(self, config: Optional[BaseModel | dict] = None): 14 | assign_config(self, config) 15 | 16 | def __call__(self, document: Document, *args, **kwargs): 17 | raise NotImplementedError 18 | ``` -------------------------------------------------------------------------------- /tests/processors/test_footnote_processor.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.processors.footnote import FootnoteProcessor 4 | from marker.schema import BlockTypes 5 | 6 | 7 | @pytest.mark.filename("population_stats.pdf") 8 | @pytest.mark.config({"page_range": [4]}) 9 | def test_footnote_processor(pdf_document): 10 | processor = FootnoteProcessor() 11 | processor(pdf_document) 12 | 13 | page0_footnotes = pdf_document.pages[0].contained_blocks(pdf_document, [BlockTypes.Footnote]) 14 | assert len(page0_footnotes) >= 2 15 | 16 | assert page0_footnotes[-1].raw_text(pdf_document).strip().startswith("5") 17 | ``` -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- ```markdown 1 | 
--- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEAT]" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## ✨ Is your feature request related to a problem? 11 | 12 | A clear and concise description of what the problem is. 13 | 14 | ## 💡 Describe the Solution You'd Like 15 | 16 | A concise description of what you want to happen or how you envision it working. 17 | 18 | ## 📋 Alternatives Considered 19 | 20 | Any alternative solutions or workarounds you've tried. 21 | 22 | ## 🧩 Additional Context 23 | 24 | Any additional context, references, or related issues. 25 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/reference.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Reference(Block): 6 | block_type: BlockTypes = BlockTypes.Reference 7 | ref: str 8 | block_description: str = "A reference to this block from another block." 
9 | 10 | def assemble_html( 11 | self, document, child_blocks, parent_structure=None, block_config=None 12 | ): 13 | template = super().assemble_html( 14 | document, child_blocks, parent_structure, block_config 15 | ) 16 | return f"<span id='{self.ref}'>{template}</span>" 17 | ``` -------------------------------------------------------------------------------- /marker/utils/batch.py: -------------------------------------------------------------------------------- ```python 1 | from marker.utils.gpu import GPUManager 2 | 3 | 4 | def get_batch_sizes_worker_counts(gpu_manager: GPUManager, peak_worker_vram: int): 5 | vram = gpu_manager.get_gpu_vram() 6 | 7 | workers = max(1, vram // peak_worker_vram) 8 | if workers == 1: 9 | return {}, workers 10 | 11 | return { 12 | "layout_batch_size": 12, 13 | "detection_batch_size": 8, 14 | "table_rec_batch_size": 12, 15 | "ocr_error_batch_size": 12, 16 | "recognition_batch_size": 64, 17 | "equation_batch_size": 16, 18 | "detector_postprocessing_cpu_workers": 2, 19 | }, workers 20 | ``` -------------------------------------------------------------------------------- /tests/providers/test_pdf_provider.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | 4 | @pytest.mark.config({"page_range": [0]}) 5 | def test_pdf_provider(doc_provider): 6 | assert len(doc_provider) == 12 7 | assert doc_provider.get_images([0], 72)[0].size == (612, 792) 8 | assert doc_provider.get_images([0], 96)[0].size == (816, 1056) 9 | 10 | page_lines = doc_provider.get_page_lines(0) 11 | assert len(page_lines) == 85 12 | 13 | spans = page_lines[0].spans 14 | assert len(spans) == 2 15 | assert spans[0].text == "Subspace Adversarial Training" 16 | assert spans[0].font == "NimbusRomNo9L-Medi" 17 | assert spans[0].formats == ["plain"] 18 | ``` -------------------------------------------------------------------------------- /tests/providers/test_image_provider.py: 
-------------------------------------------------------------------------------- ```python 1 | from marker.providers.image import ImageProvider 2 | from marker.renderers.markdown import MarkdownOutput 3 | 4 | 5 | def test_image_provider(config, temp_image): 6 | provider = ImageProvider(temp_image.name, config) 7 | assert len(provider) == 1 8 | assert provider.get_images([0], 72)[0].size == (512, 512) 9 | 10 | page_lines = provider.get_page_lines(0) 11 | assert len(page_lines) == 0 12 | 13 | def test_image_provider_conversion(pdf_converter, temp_image): 14 | markdown_output: MarkdownOutput = pdf_converter(temp_image.name) 15 | assert "Hello, World!" in markdown_output.markdown 16 | 17 | 18 | ``` -------------------------------------------------------------------------------- /marker/schema/groups/picture.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.groups.base import Group 3 | 4 | 5 | class PictureGroup(Group): 6 | block_type: BlockTypes = BlockTypes.PictureGroup 7 | block_description: str = "A picture along with associated captions." 
8 | html: str | None = None 9 | 10 | def assemble_html( 11 | self, document, child_blocks, parent_structure, block_config=None 12 | ): 13 | if self.html: 14 | return self.html 15 | 16 | child_html = super().assemble_html( 17 | document, child_blocks, parent_structure, block_config 18 | ) 19 | return child_html 20 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/mathpix.py: -------------------------------------------------------------------------------- ```python 1 | import datasets 2 | 3 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 4 | 5 | 6 | class MathpixMethod(BaseMethod): 7 | mathpix_ds: datasets.Dataset = None 8 | 9 | def __call__(self, sample) -> BenchmarkResult: 10 | uuid = sample["uuid"] 11 | data = None 12 | for row in self.mathpix_ds: 13 | if str(row["uuid"]) == str(uuid): 14 | data = row 15 | break 16 | if not data: 17 | raise ValueError(f"Could not find data for uuid {uuid}") 18 | 19 | return { 20 | "markdown": data["md"], 21 | "time": data["time"] 22 | } ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/mistral.py: -------------------------------------------------------------------------------- ```python 1 | import datasets 2 | 3 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 4 | 5 | 6 | class MistralMethod(BaseMethod): 7 | mistral_ds: datasets.Dataset = None 8 | 9 | def __call__(self, sample) -> BenchmarkResult: 10 | uuid = sample["uuid"] 11 | data = None 12 | for row in self.mistral_ds: 13 | if str(row["uuid"]) == str(uuid): 14 | data = row 15 | break 16 | if not data: 17 | raise ValueError(f"Could not find data for uuid {uuid}") 18 | 19 | return { 20 | "markdown": data["md"], 21 | "time": data["time"] 22 | } ``` -------------------------------------------------------------------------------- /marker/scripts/run_streamlit_app.py: 
-------------------------------------------------------------------------------- ```python 1 | import subprocess 2 | import os 3 | import sys 4 | 5 | 6 | def streamlit_app_cli(app_name: str = "streamlit_app.py"): 7 | argv = sys.argv[1:] 8 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 9 | app_path = os.path.join(cur_dir, app_name) 10 | cmd = [ 11 | "streamlit", 12 | "run", 13 | app_path, 14 | "--server.fileWatcherType", 15 | "none", 16 | "--server.headless", 17 | "true", 18 | ] 19 | if argv: 20 | cmd += ["--"] + argv 21 | subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"}) 22 | 23 | 24 | def extraction_app_cli(): 25 | streamlit_app_cli("extraction_app.py") 26 | ``` -------------------------------------------------------------------------------- /marker/schema/groups/figure.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.groups.base import Group 3 | 4 | 5 | class FigureGroup(Group): 6 | block_type: BlockTypes = BlockTypes.FigureGroup 7 | block_description: str = "A group that contains a figure and associated captions." 
8 | html: str | None = None 9 | 10 | def assemble_html( 11 | self, document, child_blocks, parent_structure, block_config=None 12 | ): 13 | if self.html: 14 | return self.html 15 | 16 | child_html = super().assemble_html( 17 | document, child_blocks, parent_structure, block_config 18 | ) 19 | return child_html 20 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/llamaparse.py: -------------------------------------------------------------------------------- ```python 1 | import datasets 2 | 3 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 4 | 5 | 6 | class LlamaParseMethod(BaseMethod): 7 | llamaparse_ds: datasets.Dataset = None 8 | 9 | def __call__(self, sample) -> BenchmarkResult: 10 | uuid = sample["uuid"] 11 | data = None 12 | for row in self.llamaparse_ds: 13 | if str(row["uuid"]) == str(uuid): 14 | data = row 15 | break 16 | if not data: 17 | raise ValueError(f"Could not find data for uuid {uuid}") 18 | 19 | return { 20 | "markdown": data["md"], 21 | "time": data["time"] 22 | } ``` -------------------------------------------------------------------------------- /marker/schema/blocks/handwriting.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Handwriting(Block): 6 | block_type: BlockTypes = BlockTypes.Handwriting 7 | block_description: str = "A region that contains handwriting." 
8 | html: str | None = None 9 | replace_output_newlines: bool = True 10 | 11 | def assemble_html( 12 | self, document, child_blocks, parent_structure, block_config=None 13 | ): 14 | if self.html: 15 | return self.html 16 | else: 17 | return super().assemble_html( 18 | document, child_blocks, parent_structure, block_config 19 | ) 20 | ``` -------------------------------------------------------------------------------- /tests/schema/groups/test_list_grouping.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.builders.structure import StructureBuilder 4 | from marker.schema import BlockTypes 5 | 6 | 7 | @pytest.mark.config({"page_range": [4]}) 8 | @pytest.mark.skip(reason="Model breaks this up due to equations") 9 | def test_list_grouping(pdf_document): 10 | structure = StructureBuilder() 11 | structure(pdf_document) 12 | 13 | page = pdf_document.pages[0] 14 | list_groups = [] 15 | for block in page.children: 16 | if block.block_type == BlockTypes.ListGroup: 17 | list_groups.append(block) 18 | 19 | # The model breaks this up, since it has equations in it 20 | assert len(list_groups) == 3 21 | ``` -------------------------------------------------------------------------------- /tests/renderers/test_extract_images.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.renderers.markdown import MarkdownRenderer 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | @pytest.mark.filename("A17_FlightPlan.pdf") 8 | def test_disable_extract_images(pdf_document): 9 | renderer = MarkdownRenderer({"extract_images": False}) 10 | md = renderer(pdf_document).markdown 11 | 12 | # Verify markdown 13 | assert "jpeg" not in md 14 | 15 | 16 | @pytest.mark.config({"page_range": [0]}) 17 | @pytest.mark.filename("A17_FlightPlan.pdf") 18 | def test_extract_images(pdf_document): 19 | renderer = MarkdownRenderer() 20 | md = 
renderer(pdf_document).markdown 21 | 22 | # Verify markdown 23 | assert "jpeg" in md ``` -------------------------------------------------------------------------------- /marker/renderers/extraction.py: -------------------------------------------------------------------------------- ```python 1 | from pydantic import BaseModel 2 | 3 | from marker.extractors.document import DocumentExtractionSchema 4 | from marker.renderers import BaseRenderer 5 | 6 | 7 | class ExtractionOutput(BaseModel): 8 | analysis: str 9 | document_json: str 10 | original_markdown: str 11 | 12 | 13 | class ExtractionRenderer(BaseRenderer): 14 | def __call__( 15 | self, output: DocumentExtractionSchema, markdown: str 16 | ) -> ExtractionOutput: 17 | # We definitely want to do more complex stuff here soon, so leave it in 18 | return ExtractionOutput( 19 | analysis=output.analysis, 20 | document_json=output.document_json, 21 | original_markdown=markdown, 22 | ) 23 | ``` -------------------------------------------------------------------------------- /marker/scripts/chunk_convert.py: -------------------------------------------------------------------------------- ```python 1 | import argparse 2 | import os 3 | import subprocess 4 | import pkg_resources 5 | 6 | 7 | def chunk_convert_cli(): 8 | parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.") 9 | parser.add_argument("in_folder", help="Input folder with pdfs.") 10 | parser.add_argument("out_folder", help="Output folder") 11 | args = parser.parse_args() 12 | 13 | cur_dir = os.path.dirname(os.path.abspath(__file__)) 14 | script_path = os.path.join(cur_dir, "chunk_convert.sh") 15 | 16 | # Construct the command 17 | cmd = f"{script_path} {args.in_folder} {args.out_folder}" 18 | 19 | # Execute the shell script 20 | subprocess.run(cmd, shell=True, check=True) ``` -------------------------------------------------------------------------------- /tests/processors/test_ignoretext.py: 
-------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.processors.ignoretext import IgnoreTextProcessor 4 | from marker.schema import BlockTypes 5 | 6 | 7 | @pytest.mark.filename("bio_pdf.pdf") 8 | @pytest.mark.config({"page_range": list(range(10))}) 9 | @pytest.mark.skip(reason="New layout model correctly identifies the block as a PageHeader, so nothing to be done by the IgnoreTextProcessor") 10 | def test_ignoretext_processor(pdf_document): 11 | processor = IgnoreTextProcessor() 12 | processor(pdf_document) 13 | 14 | page1_header = pdf_document.pages[1].contained_blocks(pdf_document, [BlockTypes.Text])[0] 15 | assert "bioRxiv" in page1_header.raw_text(pdf_document) 16 | 17 | assert page1_header.ignore_for_output is True 18 | ``` -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- ```yaml 1 | name: CI tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | tests: 7 | runs-on: t4_gpu 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: Install apt requirements 11 | run: | 12 | sudo apt-get update 13 | sudo apt-get install -y libpango-1.0-0 libharfbuzz0b libpangoft2-1.0-0 libgdk-pixbuf2.0-0 libcairo2 libffi-dev shared-mime-info 14 | - name: Set up Python 3.11 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: 3.11 18 | - name: Install python dependencies 19 | run: | 20 | pip install poetry 21 | poetry install --extras "full" 22 | - name: Run tests 23 | env: 24 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 25 | run: poetry run pytest 26 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/footnote.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Footnote(Block): 6 
| block_type: BlockTypes = BlockTypes.Footnote 7 | block_description: str = ( 8 | "A footnote that explains a term or concept in the document." 9 | ) 10 | replace_output_newlines: bool = True 11 | html: str | None = None 12 | 13 | def assemble_html( 14 | self, document, child_blocks, parent_structure, block_config=None 15 | ): 16 | if self.html: 17 | return super().handle_html_output( 18 | document, child_blocks, parent_structure, block_config 19 | ) 20 | 21 | return super().assemble_html( 22 | document, child_blocks, parent_structure, block_config 23 | ) 24 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/gt.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | import json 3 | 4 | from PIL import Image 5 | 6 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 7 | 8 | 9 | class GTMethod(BaseMethod): 10 | def __call__(self, sample) -> BenchmarkResult: 11 | gt_blocks = json.loads(sample["gt_blocks"]) 12 | gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0] 13 | gt_markdown = [self.convert_to_md(block) for block in gt_html] 14 | return { 15 | "markdown": gt_markdown, 16 | "time": 0 17 | } 18 | 19 | def render(self, html: List[str]) -> Image.Image: 20 | joined = "\n\n".join(html) 21 | html = f""" 22 | <html> 23 | <head></head> 24 | <body> 25 | {joined} 26 | </body> 27 | </html> 28 | """.strip() 29 | return self.html_to_image(html) ``` -------------------------------------------------------------------------------- /marker/schema/blocks/caption.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Caption(Block): 6 | block_type: BlockTypes = BlockTypes.Caption 7 | block_description: str = "A text caption that is directly above or below an image or table. 
Only used for text describing the image or table. " 8 | replace_output_newlines: bool = True 9 | html: str | None = None 10 | 11 | def assemble_html(self, document, child_blocks, parent_structure, block_config): 12 | if self.html: 13 | return super().handle_html_output( 14 | document, child_blocks, parent_structure, block_config 15 | ) 16 | 17 | return super().assemble_html( 18 | document, child_blocks, parent_structure, block_config 19 | ) 20 | ``` -------------------------------------------------------------------------------- /marker/schema/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from enum import auto, Enum 2 | 3 | 4 | class BlockTypes(str, Enum): 5 | Line = auto() 6 | Span = auto() 7 | Char = auto() 8 | FigureGroup = auto() 9 | TableGroup = auto() 10 | ListGroup = auto() 11 | PictureGroup = auto() 12 | Page = auto() 13 | Caption = auto() 14 | Code = auto() 15 | Figure = auto() 16 | Footnote = auto() 17 | Form = auto() 18 | Equation = auto() 19 | Handwriting = auto() 20 | TextInlineMath = auto() 21 | ListItem = auto() 22 | PageFooter = auto() 23 | PageHeader = auto() 24 | Picture = auto() 25 | SectionHeader = auto() 26 | Table = auto() 27 | Text = auto() 28 | TableOfContents = auto() 29 | Document = auto() 30 | ComplexRegion = auto() 31 | TableCell = auto() 32 | Reference = auto() 33 | 34 | def __str__(self): 35 | return self.name 36 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/docling.py: -------------------------------------------------------------------------------- ```python 1 | import tempfile 2 | import time 3 | 4 | from benchmarks.overall.methods import BaseMethod, BenchmarkResult 5 | 6 | 7 | class DoclingMethod(BaseMethod): 8 | model_dict: dict = None 9 | use_llm: bool = False 10 | 11 | def __call__(self, sample) -> BenchmarkResult: 12 | from docling.document_converter import DocumentConverter 13 | pdf_bytes = 
sample["pdf"] # This is a single page PDF 14 | converter = DocumentConverter() 15 | 16 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: 17 | f.write(pdf_bytes) 18 | start = time.time() 19 | result = converter.convert(f.name) 20 | total = time.time() - start 21 | 22 | return { 23 | "markdown": result.document.export_to_markdown(), 24 | "time": total 25 | } 26 | 27 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/pagefooter.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class PageFooter(Block): 6 | block_type: str = BlockTypes.PageFooter 7 | block_description: str = ( 8 | "Text that appears at the bottom of a page, like a page number." 9 | ) 10 | replace_output_newlines: bool = True 11 | ignore_for_output: bool = True 12 | html: str | None = None 13 | 14 | def assemble_html(self, document, child_blocks, parent_structure, block_config): 15 | if block_config and block_config.get("keep_pagefooter_in_output"): 16 | self.ignore_for_output = False 17 | 18 | if self.html and not self.ignore_for_output: 19 | return self.html 20 | 21 | return super().assemble_html( 22 | document, child_blocks, parent_structure, block_config 23 | ) 24 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/pageheader.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class PageHeader(Block): 6 | block_type: BlockTypes = BlockTypes.PageHeader 7 | block_description: str = ( 8 | "Text that appears at the top of a page, like a page title." 
9 | ) 10 | replace_output_newlines: bool = True 11 | ignore_for_output: bool = True 12 | html: str | None = None 13 | 14 | def assemble_html(self, document, child_blocks, parent_structure, block_config): 15 | if block_config and block_config.get("keep_pageheader_in_output"): 16 | self.ignore_for_output = False 17 | 18 | if self.html and not self.ignore_for_output: 19 | return self.html 20 | 21 | return super().assemble_html( 22 | document, child_blocks, parent_structure, block_config 23 | ) 24 | ``` -------------------------------------------------------------------------------- /marker/schema/groups/table.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.blocks import BlockOutput 5 | from marker.schema.groups.base import Group 6 | 7 | 8 | class TableGroup(Group): 9 | block_type: BlockTypes = BlockTypes.TableGroup 10 | block_description: str = "A table along with associated captions." 
11 | html: str | None = None 12 | 13 | def assemble_html( 14 | self, 15 | document, 16 | child_blocks: List[BlockOutput], 17 | parent_structure=None, 18 | block_config: dict | None = None, 19 | ): 20 | if self.html: 21 | return self.handle_html_output( 22 | document, child_blocks, parent_structure, block_config 23 | ) 24 | 25 | return super().assemble_html( 26 | document, child_blocks, parent_structure, block_config 27 | ) 28 | ``` -------------------------------------------------------------------------------- /tests/builders/test_rotated_bboxes.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | 5 | 6 | @pytest.mark.config({"page_range": [0]}) 7 | @pytest.mark.filename("adversarial_rot.pdf") 8 | def test_rotated_bboxes(pdf_document): 9 | first_page = pdf_document.pages[0] 10 | 11 | # Ensure we match all text lines up properly 12 | text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,)) 13 | text_blocks = first_page.contained_blocks( 14 | pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath) 15 | ) 16 | # assert len(text_lines) == 84 17 | 18 | # Ensure the bbox sizes match up 19 | max_line_position = max([line.polygon.x_end for line in text_lines]) 20 | max_block_position = max( 21 | [block.polygon.x_end for block in text_blocks if block.source == "layout"] 22 | ) 23 | assert max_line_position <= max_block_position 24 | ``` -------------------------------------------------------------------------------- /marker/processors/document_toc.py: -------------------------------------------------------------------------------- ```python 1 | from marker.processors import BaseProcessor 2 | from marker.schema import BlockTypes 3 | from marker.schema.document import Document 4 | 5 | 6 | class DocumentTOCProcessor(BaseProcessor): 7 | """ 8 | A processor for generating a table of contents for the document. 
9 | """ 10 | block_types = (BlockTypes.SectionHeader, ) 11 | 12 | def __call__(self, document: Document): 13 | toc = [] 14 | for page in document.pages: 15 | for block in page.contained_blocks(document, self.block_types): 16 | toc.append({ 17 | "title": block.raw_text(document).strip(), 18 | "heading_level": block.heading_level, 19 | "page_id": page.page_id, 20 | "polygon": block.polygon.polygon 21 | }) 22 | document.table_of_contents = toc 23 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/registry.py: -------------------------------------------------------------------------------- ```python 1 | from benchmarks.overall.methods.docling import DoclingMethod 2 | from benchmarks.overall.methods.gt import GTMethod 3 | from benchmarks.overall.methods.llamaparse import LlamaParseMethod 4 | from benchmarks.overall.methods.marker import MarkerMethod 5 | from benchmarks.overall.methods.mathpix import MathpixMethod 6 | from benchmarks.overall.methods.mistral import MistralMethod 7 | from benchmarks.overall.methods.olmocr import OlmOCRMethod 8 | from benchmarks.overall.scorers.heuristic import HeuristicScorer 9 | from benchmarks.overall.scorers.llm import LLMScorer 10 | 11 | SCORE_REGISTRY = { 12 | "heuristic": HeuristicScorer, 13 | "llm": LLMScorer 14 | } 15 | 16 | METHOD_REGISTRY = { 17 | "marker": MarkerMethod, 18 | "gt": GTMethod, 19 | "mathpix": MathpixMethod, 20 | "llamaparse": LlamaParseMethod, 21 | "docling": DoclingMethod, 22 | "olmocr": OlmOCRMethod, 23 | "mistral": MistralMethod 24 | } ``` -------------------------------------------------------------------------------- /marker/processors/page_header.py: -------------------------------------------------------------------------------- ```python 1 | from marker.processors import BaseProcessor 2 | from marker.schema import BlockTypes 3 | from marker.schema.document import Document 4 | from marker.schema.groups.page import PageGroup 5 | 6 | 7 | class 
PageHeaderProcessor(BaseProcessor): 8 | """ 9 | A processor for moving PageHeaders to the top 10 | """ 11 | block_types = (BlockTypes.PageHeader,) 12 | 13 | def __call__(self, document: Document): 14 | for page in document.pages: 15 | self.move_page_header_to_top(page, document) 16 | 17 | def move_page_header_to_top(self, page: PageGroup, document: Document): 18 | page_header_blocks = page.contained_blocks(document, self.block_types) 19 | page_header_block_ids = [block.id for block in page_header_blocks] 20 | for block_id in page_header_block_ids: 21 | page.structure.remove(block_id) 22 | page.structure[:0] = page_header_block_ids 23 | 24 | ``` -------------------------------------------------------------------------------- /marker/schema/groups/list.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.groups.base import Group 3 | 4 | 5 | class ListGroup(Group): 6 | block_type: BlockTypes = BlockTypes.ListGroup 7 | has_continuation: bool = False 8 | block_description: str = "A group of list items that should be rendered together." 
@click.command("Download data from inference services")
@click.argument("service", type=click.Choice(["mathpix", "llamaparse", "mistral"]))
@click.option("--max_rows", type=int, default=2200)
@click.option("--api_key", type=str, default=None)
@click.option("--app_id", type=str, default=None)
def main(service: str, max_rows: int, api_key: str, app_id: str):
    """Run one hosted-inference downloader and push its output to the hub.

    ``service`` selects the backend; ``api_key``/``app_id`` are that
    service's credentials (``app_id`` is presumably Mathpix-only — the other
    downloaders receive it too; confirm against the downloader signatures).
    """
    # Dispatch table: service name -> downloader class.
    registry = {
        "mathpix": MathpixDownloader,
        "llamaparse": LlamaParseDownloader,
        "mistral": MistralDownloader,
    }
    downloader = registry[service](api_key, app_id, max_rows=max_rows)

    # Generate data and upload to hub
    downloader()

if __name__ == "__main__":
    main()
def configure_logging():
    """Configure the shared ``marker`` logger and quiet noisy dependencies.

    Safe to call more than once: the stream handler is only attached when
    the logger has no handlers yet.
    """
    logger = get_logger()

    if not logger.handlers:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(
            logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")
        )
        logger.addHandler(stream_handler)

    logger.setLevel(settings.LOGLEVEL)

    # Ignore future warnings
    warnings.simplefilter(action="ignore", category=FutureWarning)

    # Quiet chatty third-party loggers.
    for noisy_name, noisy_level in (
        ("PIL", logging.ERROR),
        ("fontTools.subset", logging.ERROR),
        ("fontTools.ttLib.ttFont", logging.ERROR),
        ("weasyprint", logging.CRITICAL),
    ):
        logging.getLogger(noisy_name).setLevel(noisy_level)


def get_logger():
    """Return the package-wide ``marker`` logger."""
    return logging.getLogger("marker")
class SectionHeader(Block):
    """A section heading, rendered as an ``<h1>``-``<h6>`` element."""

    block_type: BlockTypes = BlockTypes.SectionHeader
    heading_level: Optional[int] = None
    block_description: str = "The header of a section of text or other blocks."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.ignore_for_output:
            return ""

        # An LLM processor may have produced HTML directly; prefer it.
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        inner = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        ).replace("\n", " ")
        # Fall back to h2 when no heading level was detected.
        level = self.heading_level if self.heading_level else 2
        return f"<h{level}>{inner}</h{level}>"
class HTMLProvider(PdfProvider):
    """Provider that renders an HTML file to a temporary PDF and then
    delegates all further handling to :class:`PdfProvider`.

    The temporary PDF is removed when the provider is garbage collected,
    or immediately if the HTML-to-PDF conversion fails.
    """

    def __init__(self, filepath: str, config=None):
        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        self.temp_pdf_path = temp_pdf.name
        temp_pdf.close()

        # Convert HTML to PDF
        try:
            self.convert_html_to_pdf(filepath)
        except Exception as e:
            # Don't leave a stale temp file behind when conversion fails.
            if os.path.exists(self.temp_pdf_path):
                os.remove(self.temp_pdf_path)
            raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}")

        # Initialize the PDF provider with the temp pdf path
        super().__init__(self.temp_pdf_path, config)

    def __del__(self):
        # Guard with getattr: __init__ may have failed before the attribute
        # was assigned, and __del__ must never raise.
        temp_pdf_path = getattr(self, "temp_pdf_path", None)
        if temp_pdf_path and os.path.exists(temp_pdf_path):
            os.remove(temp_pdf_path)

    def convert_html_to_pdf(self, filepath: str):
        """Render *filepath* into ``self.temp_pdf_path`` using WeasyPrint."""
        from weasyprint import HTML

        font_css = self.get_font_css()
        HTML(filename=filepath, encoding="utf-8").write_pdf(
            self.temp_pdf_path, stylesheets=[font_css]
        )
class Figure(Block):
    """An image block holding a chart or other data-bearing figure."""

    block_type: BlockTypes = BlockTypes.Figure
    description: str | None = None
    html: str | None = None
    block_description: str = "A chart or other image that contains data."

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        # If an LLM processor supplied HTML for this figure, emit that.
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        # Only Reference children are rendered inline for figures.
        references = [
            child
            for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        rendered = super().assemble_html(
            document, references, parent_structure, block_config
        )
        if not self.description:
            return rendered
        return (
            rendered
            + f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
        )
class Equation(Block):
    """A display (block-level) math equation."""

    block_type: BlockTypes = BlockTypes.Equation
    html: str | None = None
    block_description: str = "A block math equation."

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        if not self.html:
            # No pre-rendered math: wrap the children's own HTML.
            inner = super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
            return f"<p block-type='{self.block_type}'>{inner}</p>"

        # Pre-rendered math: render only Reference children, then append it.
        references = [
            child
            for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        out = super().assemble_html(
            document, references, parent_structure, block_config
        )
        out += f"""<p block-type='{self.block_type}'>{self.html}</p>"""
        return out
import pytest
from marker.converters.table import TableConverter
from marker.renderers.markdown import MarkdownOutput
from marker.util import classes_to_strings

def _table_converter(config, model_dict, renderer, temp_pdf):
    # Shared driver: build a TableConverter and check the rendered markdown.
    converter = TableConverter(
        artifact_dict=model_dict,
        processor_list=None,
        renderer=classes_to_strings([renderer])[0],
        config=config
    )

    markdown_output: MarkdownOutput = converter(temp_pdf.name)
    markdown = markdown_output.markdown

    # "cyclic" presumably appears in the table on page 5 of the fixture
    # document — confirm against the test PDF if this assertion changes.
    assert len(markdown) > 0
    assert "cyclic" in markdown


@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5]})
def test_table_converter(config, model_dict, renderer, temp_doc):
    # Digital-text path (no forced OCR).
    _table_converter(config, model_dict, renderer, temp_doc)

@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5], "force_ocr": True})
def test_table_converter_ocr(config, model_dict, renderer, temp_doc):
    # Same expectations with OCR forced on.
    _table_converter(config, model_dict, renderer, temp_doc)
def test_blank_page(config, doc_provider, layout_model, ocr_error_model, detection_model):
    """Builders should tolerate pages with no layout boxes and no lines."""
    layout_builder = LayoutBuilder(layout_model, config)
    line_builder = LineBuilder(detection_model, ocr_error_model)
    builder = DocumentBuilder(config)
    document = builder.build_document(doc_provider)

    # Simulate a blank document: empty layout results and no provider/OCR lines.
    layout_results = [LayoutResult(
        bboxes=[],
        image_bbox=p.polygon.bbox,
    ) for p in document.pages]
    provider_lines = {p.page_id: [] for p in document.pages}
    ocr_lines = {p.page_id: [] for p in document.pages}

    layout_builder.add_blocks_to_pages(document.pages, layout_results)
    line_builder.merge_blocks(document, provider_lines, ocr_lines)

    # Pages must still end up with well-formed (list) children/structure.
    assert all([isinstance(p.children, list) for p in document.pages])
    assert all([isinstance(p.structure, list) for p in document.pages])
class ComplexRegion(Block):
    """A mixed region that could not be classified as a single block type."""

    block_type: BlockTypes = BlockTypes.ComplexRegion
    html: str | None = None
    block_description: str = "A complex region that can consist of multiple different types of blocks mixed with images. This block is chosen when it is difficult to categorize the region as a single block type."

    def assemble_html(self, document, child_blocks, parent_structure, block_config=None):
        # `block_config` now defaults to None, matching the assemble_html
        # signature of every other block type (backward compatible).
        if self.html:
            # Pre-rendered HTML: render only Reference children, append it.
            child_ref_blocks = [
                block
                for block in child_blocks
                if block.id.block_type == BlockTypes.Reference
            ]
            html = super().assemble_html(
                document, child_ref_blocks, parent_structure, block_config
            )
            return html + self.html
        else:
            template = super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
            return f"<p>{template}</p>"
class Picture(Block):
    """An image block that represents a picture."""

    block_type: BlockTypes = BlockTypes.Picture
    description: str | None = None
    block_description: str = "An image block that represents a picture."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        # Prefer HTML produced by an LLM processor when present.
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        # Pictures only render their Reference children inline.
        references = [
            child
            for child in child_blocks
            if child.id.block_type == BlockTypes.Reference
        ]
        rendered = super().assemble_html(
            document, references, parent_structure, block_config
        )

        if not self.description:
            return rendered
        return (
            rendered
            + f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
        )
class GoogleVertexService(BaseGeminiService):
    """Gemini service backed by Google Cloud Vertex AI instead of AI Studio."""

    vertex_project_id: Annotated[
        str,
        "Google Cloud Project ID for Vertex AI.",
    ] = None
    vertex_location: Annotated[
        str,
        "Google Cloud Location for Vertex AI.",
    ] = "us-central1"
    gemini_model_name: Annotated[
        str,
        "The name of the Google model to use for the service."
    ] = "gemini-2.0-flash-001"
    vertex_dedicated: Annotated[
        bool,
        "Whether to use a dedicated Vertex AI instance."
    ] = False

    def get_google_client(self, timeout: int):
        """Build a Vertex-mode genai client; ``timeout`` is in seconds."""
        http_options = {"timeout": timeout * 1000}  # Convert to milliseconds
        if self.vertex_dedicated:
            # Header marks the request for a dedicated Vertex instance.
            http_options["headers"] = {"x-vertex-ai-llm-request-type": "dedicated"}
        return genai.Client(
            vertexai=True,
            project=self.vertex_project_id,
            location=self.vertex_location,
            http_options=http_options,
        )
19 | exit 1 20 | fi 21 | 22 | if [[ -z "$2" ]]; then 23 | echo "Please provide an output folder." 24 | exit 1 25 | fi 26 | 27 | INPUT_FOLDER=$1 28 | OUTPUT_FOLDER=$2 29 | 30 | # Ensure output folder exists 31 | mkdir -p "$OUTPUT_FOLDER" 32 | 33 | # Loop from 0 to NUM_DEVICES and run the marker command in parallel 34 | for (( i=0; i<$NUM_DEVICES; i++ )); do 35 | DEVICE_NUM=$i 36 | export DEVICE_NUM 37 | export NUM_DEVICES 38 | export NUM_WORKERS 39 | echo "Running marker on GPU $DEVICE_NUM" 40 | cmd="CUDA_VISIBLE_DEVICES=$DEVICE_NUM marker $INPUT_FOLDER --output_dir $OUTPUT_FOLDER --num_chunks $NUM_DEVICES --chunk_idx $DEVICE_NUM --workers $NUM_WORKERS" 41 | eval $cmd & 42 | 43 | sleep 5 44 | done 45 | 46 | # Wait for all background processes to finish 47 | wait ``` -------------------------------------------------------------------------------- /marker/schema/blocks/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from __future__ import annotations 2 | 3 | from marker.schema.blocks.base import Block, BlockId, BlockOutput 4 | from marker.schema.blocks.caption import Caption 5 | from marker.schema.blocks.code import Code 6 | from marker.schema.blocks.figure import Figure 7 | from marker.schema.blocks.footnote import Footnote 8 | from marker.schema.blocks.form import Form 9 | from marker.schema.blocks.equation import Equation 10 | from marker.schema.blocks.handwriting import Handwriting 11 | from marker.schema.blocks.inlinemath import InlineMath 12 | from marker.schema.blocks.listitem import ListItem 13 | from marker.schema.blocks.pagefooter import PageFooter 14 | from marker.schema.blocks.pageheader import PageHeader 15 | from marker.schema.blocks.picture import Picture 16 | from marker.schema.blocks.sectionheader import SectionHeader 17 | from marker.schema.blocks.table import Table 18 | from marker.schema.blocks.text import Text 19 | from marker.schema.blocks.toc import TableOfContents 20 | from 
class TableCell(Block):
    """A cell in a table, rendered as a ``<td>``/``<th>`` element."""

    block_type: BlockTypes = BlockTypes.TableCell
    rowspan: int
    colspan: int
    row_id: int
    col_id: int
    is_header: bool
    text_lines: List[str] | None = None
    block_description: str = "A cell in a table."

    @property
    def text(self) -> str:
        # Tolerate unset text_lines instead of raising TypeError on join(None).
        return "\n".join(self.text_lines or [])

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        """Render the cell tag, including row/col spans and optional block id."""
        add_cell_id = block_config and block_config.get("add_block_ids", False)

        tag_cls = "th" if self.is_header else "td"
        tag = f"<{tag_cls}"
        if self.rowspan > 1:
            tag += f" rowspan={self.rowspan}"
        if self.colspan > 1:
            tag += f" colspan={self.colspan}"
        if add_cell_id:
            tag += f' data-block-id="{self.id}"'
        # Bug fix: render without mutating self.text_lines (previously a None
        # value was overwritten with [] as a side effect of rendering).
        text = "<br>".join(self.text_lines or [])
        return f"{tag}>{text}</{tag_cls}>"
def verify_table_scores(file_path):
    """Check that the average marker table score in *file_path* is >= 0.7.

    Args:
        file_path: Path to a JSON file whose "marker" key holds a list of
            result dicts, each with a numeric "marker_score".

    Raises:
        ValueError: If the average score falls below 0.7.
    """
    with open(file_path, 'r') as file:
        data = json.load(file)

    results = data["marker"]
    # Bug fix: average over the number of marker results, not len(data),
    # which counted the top-level JSON keys instead.
    avg = sum(r["marker_score"] for r in results) / len(results)
    if avg < 0.7:
        raise ValueError("Average score is below the required threshold of 0.7")
container info if relevant): 41 | 42 | ## ✅ Expected Behavior 43 | 44 | What did you expect Marker to do? 45 | 46 | ## 📟 Command or Code Used 47 | 48 | Paste the **exact bash command** or **Python code** you used to run Marker: 49 | 50 | <details> 51 | <summary>Click to expand</summary> 52 | 53 | ```bash 54 | # or Python code block 55 | your_command_here --with-flags 56 | ``` 57 | 58 | </details> 59 | 60 | ## 📎 Additional Context 61 | 62 | Any other context that might help us debug this (e.g., CLI options, working directory, runtime settings). 63 | ``` -------------------------------------------------------------------------------- /marker/models.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | 3 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = ( 4 | "1" # Transformers uses .isin for an op, which is not supported on MPS 5 | ) 6 | 7 | from surya.foundation import FoundationPredictor 8 | from surya.detection import DetectionPredictor 9 | from surya.layout import LayoutPredictor 10 | from surya.ocr_error import OCRErrorPredictor 11 | from surya.recognition import RecognitionPredictor 12 | from surya.table_rec import TableRecPredictor 13 | from surya.settings import settings as surya_settings 14 | 15 | 16 | def create_model_dict( 17 | device=None, dtype=None, attention_implementation: str | None = None 18 | ) -> dict: 19 | return { 20 | "layout_model": LayoutPredictor(FoundationPredictor(checkpoint=surya_settings.LAYOUT_MODEL_CHECKPOINT, attention_implementation=attention_implementation, device=device, dtype=dtype)), 21 | "recognition_model": RecognitionPredictor(FoundationPredictor(checkpoint=surya_settings.RECOGNITION_MODEL_CHECKPOINT, attention_implementation=attention_implementation, device=device, dtype=dtype)), 22 | "table_rec_model": TableRecPredictor(device=device, dtype=dtype), 23 | "detection_model": DetectionPredictor(device=device, dtype=dtype), 24 | "ocr_error_model": 
class BaseExtractor:
    """
    An extractor that uses a provided service to extract structured data from documents.
    """

    max_concurrency: Annotated[
        int,
        "The maximum number of concurrent requests to make to the Gemini model.",
    ] = 3
    disable_tqdm: Annotated[
        bool,
        "Whether to disable the tqdm progress bar.",
    ] = False

    def __init__(self, llm_service: BaseService, config=None):
        # Apply config overrides onto instance attributes
        # (see marker.util.assign_config for the exact semantics).
        assign_config(self, config)
        self.llm_service = llm_service

    def extract_image(
        self,
        document: Document,
        page: PageGroup,
        remove_blocks: Sequence[BlockTypes] | None = None,
        highres: bool = False,  # Default False to save tokens
    ) -> Image.Image:
        """Render *page* to a PIL image, optionally excluding some block types."""
        return page.get_image(
            document,
            highres=highres,
            remove_blocks=remove_blocks,
        )

    def __call__(self, document: Document, *args, **kwargs):
        # Subclasses implement the actual extraction here.
        raise NotImplementedError
@click.command(help="Uploads files to an S3 bucket")
@click.argument("filepath", type=str)
@click.argument("s3_path", type=str)
@click.option("--bucket_name", type=str, default="datalab")
@click.option("--access_key_id", type=str, default="<access_key_id>")
@click.option("--access_key_secret", type=str, default="<access_key_secret>")
def main(filepath: str, s3_path: str, bucket_name: str, access_key_id: str, access_key_secret: str):
    """Upload a single file to an S3-compatible (Cloudflare R2) bucket,
    under ``{s3_path}/{filename}``.
    """
    filepath = Path(filepath)
    # Upload the files to S3
    s3_client = boto3.client(
        's3',
        endpoint_url=S3_API_URL,
        aws_access_key_id=access_key_id,
        aws_secret_access_key=access_key_secret,
        region_name="enam"
    )

    s3_key = f"{s3_path}/{filepath.name}"

    try:
        s3_client.upload_file(
            str(filepath),
            bucket_name,
            s3_key
        )
    except Exception as e:
        print(f"Error uploading {filepath}: {str(e)}")
    else:
        # Bug fix: only report success when the upload actually succeeded
        # (previously this printed unconditionally, even after an error).
        print(f"Uploaded files to {s3_path}")

if __name__ == "__main__":
    main()
os.getenv("VERTEX_PROJECT_ID"), 23 | }) 24 | 25 | block_converter = PdfConverter( 26 | artifact_dict=self.model_dict, 27 | config=parser.generate_config_dict(), 28 | llm_service=parser.get_llm_service() 29 | ) 30 | 31 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: 32 | f.write(pdf_bytes) 33 | start = time.time() 34 | rendered = block_converter(f.name) 35 | total = time.time() - start 36 | 37 | return { 38 | "markdown": rendered.markdown, 39 | "time": total 40 | } 41 | 42 | ``` -------------------------------------------------------------------------------- /marker/utils/image.py: -------------------------------------------------------------------------------- ```python 1 | from PIL import Image 2 | import numpy as np 3 | import cv2 4 | from typing import List, Optional 5 | 6 | def is_blank_image(image: Image.Image, polygon: Optional[List[List[int]]] = None) -> bool: 7 | image = np.asarray(image) 8 | if ( 9 | image is None 10 | or image.size == 0 11 | or image.shape[0] == 0 12 | or image.shape[1] == 0 13 | ): 14 | # Handle empty image case 15 | return True 16 | 17 | if polygon is not None: 18 | rounded_polys = [[int(corner[0]), int(corner[1])] for corner in polygon] 19 | if rounded_polys[0] == rounded_polys[1] and rounded_polys[2] == rounded_polys[3]: 20 | return True 21 | 22 | gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) 23 | gray = cv2.GaussianBlur(gray, (7, 7), 0) 24 | 25 | # Adaptive threshold (inverse for text as white) 26 | binarized = cv2.adaptiveThreshold( 27 | gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15 28 | ) 29 | 30 | num_labels, labels, stats, _ = cv2.connectedComponentsWithStats( 31 | binarized, connectivity=8 32 | ) 33 | cleaned = np.zeros_like(binarized) 34 | for i in range(1, num_labels): # skip background 35 | cleaned[labels == i] = 255 36 | 37 | kernel = np.ones((1, 5), np.uint8) 38 | dilated = cv2.dilate(cleaned, kernel, iterations=3) 39 | b = dilated / 255 40 | return bool(b.sum() == 0) ``` 
-------------------------------------------------------------------------------- /marker/schema/blocks/text.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class Text(Block): 6 | block_type: BlockTypes = BlockTypes.Text 7 | has_continuation: bool = False 8 | blockquote: bool = False 9 | blockquote_level: int = 0 10 | html: str | None = None 11 | block_description: str = "A paragraph or line of text." 12 | 13 | def assemble_html( 14 | self, document, child_blocks, parent_structure, block_config=None 15 | ): 16 | if self.ignore_for_output: 17 | return "" 18 | 19 | # This happens when we used an llm processor 20 | if self.html: 21 | return super().handle_html_output( 22 | document, child_blocks, parent_structure, block_config 23 | ) 24 | 25 | template = super().assemble_html( 26 | document, child_blocks, parent_structure, block_config 27 | ) 28 | template = template.replace("\n", " ") 29 | 30 | el_attr = f" block-type='{self.block_type}'" 31 | if self.has_continuation: 32 | el_attr += " class='has-continuation'" 33 | 34 | if self.blockquote: 35 | blockquote_prefix = "<blockquote>" * self.blockquote_level 36 | blockquote_suffix = "</blockquote>" * self.blockquote_level 37 | return f"{blockquote_prefix}<p{el_attr}>{template}</p>{blockquote_suffix}" 38 | else: 39 | return f"<p{el_attr}>{template}</p>" 40 | ``` -------------------------------------------------------------------------------- /.github/workflows/cla.yml: -------------------------------------------------------------------------------- ```yaml 1 | name: "Marker CLA Assistant" 2 | on: 3 | issue_comment: 4 | types: [created] 5 | pull_request_target: 6 | types: [opened,closed,synchronize] 7 | 8 | # explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings 9 | permissions: 10 | actions: write 11 | contents: 
write 12 | pull-requests: write 13 | statuses: write 14 | 15 | jobs: 16 | CLAAssistant: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: "Marker CLA Assistant" 20 | if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target' 21 | uses: contributor-assistant/[email protected] 22 | env: 23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 24 | # the below token should have repo scope and must be manually added by you in the repository's secret 25 | # This token is required only if you have configured to store the signatures in a remote repository/organization 26 | PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} 27 | with: 28 | path-to-signatures: 'signatures/version1/cla.json' 29 | path-to-document: 'https://github.com/VikParuchuri/marker/blob/master/CLA.md' 30 | # branch should not be protected 31 | branch: 'master' 32 | allowlist: VikParuchuri,Sandy ``` -------------------------------------------------------------------------------- /tests/builders/test_overriding.py: -------------------------------------------------------------------------------- ```python 1 | import multiprocessing as mp 2 | 3 | import pytest 4 | 5 | from marker.providers.pdf import PdfProvider 6 | from marker.schema import BlockTypes 7 | from marker.schema.blocks import SectionHeader 8 | from marker.schema.document import Document 9 | from marker.schema.registry import register_block_class 10 | from marker.schema.text import Line 11 | from tests.utils import setup_pdf_provider 12 | 13 | 14 | class NewSectionHeader(SectionHeader): 15 | pass 16 | 17 | 18 | class NewLine(Line): 19 | pass 20 | 21 | 22 | @pytest.mark.config({ 23 | "page_range": [0], 24 | "override_map": {BlockTypes.SectionHeader: NewSectionHeader} 25 | }) 26 | def test_overriding(pdf_document: Document): 27 | assert pdf_document.pages[0]\ 28 | .get_block(pdf_document.pages[0].structure[0]).__class__ == 
NewSectionHeader 29 | 30 | 31 | def get_lines(pdf: str, config=None): 32 | for block_type, block_cls in config["override_map"].items(): 33 | register_block_class(block_type, block_cls) 34 | 35 | provider: PdfProvider = setup_pdf_provider(pdf, config) 36 | return provider.get_page_lines(0) 37 | 38 | 39 | def test_overriding_mp(): 40 | config = { 41 | "page_range": [0], 42 | "override_map": {BlockTypes.Line: NewLine} 43 | } 44 | 45 | pdf_list = ["adversarial.pdf", "adversarial_rot.pdf"] 46 | 47 | with mp.Pool(processes=2) as pool: 48 | results = pool.starmap(get_lines, [(pdf, config) for pdf in pdf_list]) 49 | assert all([r[0].line.__class__ == NewLine for r in results]) 50 | ``` -------------------------------------------------------------------------------- /.github/workflows/scripts.yml: -------------------------------------------------------------------------------- ```yaml 1 | name: Test CLI scripts 2 | 3 | on: [push] 4 | 5 | jobs: 6 | tests: 7 | runs-on: t4_gpu 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: Set up Python 3.11 11 | uses: actions/setup-python@v4 12 | with: 13 | python-version: 3.11 14 | - name: Install python dependencies 15 | run: | 16 | pip install poetry 17 | poetry install --extras "full" 18 | - name: Download benchmark data 19 | run: | 20 | wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi" 21 | unzip -o benchmark_data.zip 22 | - name: Test single script 23 | run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 24 | - name: Test convert script 25 | run: poetry run marker benchmark_data/pdfs --max_files 1 --page_range 0 26 | - name: Text convert script multiple workers 27 | run: poetry run marker benchmark_data/pdfs --max_files 2 --page_range 0-5 28 | - name: Test llm option 29 | run: | 30 | poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 --use_llm > output.txt || echo "Command failed but continuing" 31 | if ! 
grep -q "UserWarning" output.txt; then 32 | echo "Success: No UserWarning found" 33 | exit 0 34 | else 35 | echo "Error: UserWarning found in output" 36 | exit 1 37 | fi ``` -------------------------------------------------------------------------------- /marker/processors/footnote.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | 3 | from marker.processors import BaseProcessor 4 | from marker.schema import BlockTypes 5 | from marker.schema.document import Document 6 | from marker.schema.groups import PageGroup 7 | 8 | 9 | class FootnoteProcessor(BaseProcessor): 10 | """ 11 | A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks. 12 | """ 13 | block_types = (BlockTypes.Footnote,) 14 | 15 | def __call__(self, document: Document): 16 | for page in document.pages: 17 | self.push_footnotes_to_bottom(page, document) 18 | self.assign_superscripts(page, document) 19 | 20 | def push_footnotes_to_bottom(self, page: PageGroup, document: Document): 21 | footnote_blocks = page.contained_blocks(document, self.block_types) 22 | 23 | # Push footnotes to the bottom 24 | for block in footnote_blocks: 25 | # Check if it is top-level 26 | if block.id in page.structure: 27 | # Move to bottom if it is 28 | page.structure.remove(block.id) 29 | page.add_structure(block) 30 | 31 | def assign_superscripts(self, page: PageGroup, document: Document): 32 | footnote_blocks = page.contained_blocks(document, self.block_types) 33 | 34 | for block in footnote_blocks: 35 | for span in block.contained_blocks(document, (BlockTypes.Span,)): 36 | if re.match(r"^[0-9\W]+", span.text): 37 | span.has_superscript = True 38 | break 39 | ``` -------------------------------------------------------------------------------- /marker/scripts/convert_single.py: -------------------------------------------------------------------------------- ```python 1 | import os 2 | 3 | os.environ["GRPC_VERBOSITY"] = "ERROR" 4 | 
os.environ["GLOG_minloglevel"] = "2" 5 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = ( 6 | "1" # Transformers uses .isin for a simple op, which is not supported on MPS 7 | ) 8 | 9 | import time 10 | import click 11 | 12 | from marker.config.parser import ConfigParser 13 | from marker.config.printer import CustomClickPrinter 14 | from marker.logger import configure_logging, get_logger 15 | from marker.models import create_model_dict 16 | from marker.output import save_output 17 | 18 | configure_logging() 19 | logger = get_logger() 20 | 21 | 22 | @click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.") 23 | @click.argument("fpath", type=str) 24 | @ConfigParser.common_options 25 | def convert_single_cli(fpath: str, **kwargs): 26 | models = create_model_dict() 27 | start = time.time() 28 | config_parser = ConfigParser(kwargs) 29 | 30 | converter_cls = config_parser.get_converter_cls() 31 | converter = converter_cls( 32 | config=config_parser.generate_config_dict(), 33 | artifact_dict=models, 34 | processor_list=config_parser.get_processors(), 35 | renderer=config_parser.get_renderer(), 36 | llm_service=config_parser.get_llm_service(), 37 | ) 38 | rendered = converter(fpath) 39 | out_folder = config_parser.get_output_folder(fpath) 40 | save_output(rendered, out_folder, config_parser.get_base_filename(fpath)) 41 | 42 | logger.info(f"Saved markdown to {out_folder}") 43 | logger.info(f"Total time: {time.time() - start}") 44 | ``` -------------------------------------------------------------------------------- /marker/schema/blocks/inlinemath.py: -------------------------------------------------------------------------------- ```python 1 | from marker.schema import BlockTypes 2 | from marker.schema.blocks import Block 3 | 4 | 5 | class InlineMath(Block): 6 | block_type: BlockTypes = BlockTypes.TextInlineMath 7 | has_continuation: bool = False 8 | blockquote: bool = False 9 | blockquote_level: int = 0 10 | block_description: str = "A text block that 
contains inline math. This is not used for italic text or references - only for text that contains math." 11 | html: str | None = None 12 | 13 | def assemble_html( 14 | self, document, child_blocks, parent_structure, block_config=None 15 | ): 16 | if self.ignore_for_output: 17 | return "" 18 | 19 | if self.html: 20 | return super().handle_html_output( 21 | document, child_blocks, parent_structure, block_config 22 | ) 23 | 24 | template = super().assemble_html( 25 | document, child_blocks, parent_structure, block_config 26 | ) 27 | template = template.replace("\n", " ") 28 | 29 | el_attr = f" block-type='{self.block_type}'" 30 | if self.has_continuation: 31 | el_attr += " class='has-continuation'" 32 | 33 | if self.blockquote: 34 | # Add indentation for blockquote levels 35 | blockquote_prefix = "<blockquote>" * self.blockquote_level 36 | blockquote_suffix = "</blockquote>" * self.blockquote_level 37 | return f"{blockquote_prefix}<p{el_attr}>{template}</p>{blockquote_suffix}" 38 | else: 39 | return f"<p{el_attr}>{template}</p>" 40 | ``` -------------------------------------------------------------------------------- /tests/builders/test_layout_replace.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.builders.document import DocumentBuilder 4 | from marker.builders.layout import LayoutBuilder 5 | from marker.builders.line import LineBuilder 6 | from marker.renderers.markdown import MarkdownRenderer 7 | from marker.schema import BlockTypes 8 | from marker.schema.registry import get_block_class 9 | 10 | 11 | @pytest.mark.filename("thinkpython.pdf") 12 | @pytest.mark.config({"page_range": [0]}) 13 | def test_layout_replace( 14 | request, config, doc_provider, layout_model, ocr_error_model, detection_model 15 | ): 16 | # The llm layout builder replaces blocks - this makes sure text is still merged properly 17 | layout_builder = LayoutBuilder(layout_model, config) 18 | line_builder = 
LineBuilder(detection_model, ocr_error_model, config) 19 | builder = DocumentBuilder(config) 20 | document = builder.build_document(doc_provider) 21 | layout_builder(document, doc_provider) 22 | page = document.pages[0] 23 | new_blocks = [] 24 | for block in page.contained_blocks(document, (BlockTypes.Text,)): 25 | generated_block_class = get_block_class(BlockTypes.TextInlineMath) 26 | generated_block = generated_block_class( 27 | polygon=block.polygon, 28 | page_id=block.page_id, 29 | structure=block.structure, 30 | ) 31 | page.replace_block(block, generated_block) 32 | new_blocks.append(generated_block) 33 | line_builder(document, doc_provider) 34 | 35 | for block in new_blocks: 36 | assert block.raw_text(document).strip() 37 | 38 | renderer = MarkdownRenderer(config) 39 | rendered = renderer(document) 40 | 41 | assert "Think Python" in rendered.markdown 42 | ``` -------------------------------------------------------------------------------- /marker/processors/code.py: -------------------------------------------------------------------------------- ```python 1 | from marker.processors import BaseProcessor 2 | from marker.schema import BlockTypes 3 | from marker.schema.blocks import Code 4 | from marker.schema.document import Document 5 | 6 | 7 | class CodeProcessor(BaseProcessor): 8 | """ 9 | A processor for formatting code blocks. 
10 | """ 11 | block_types = (BlockTypes.Code, ) 12 | 13 | def __call__(self, document: Document): 14 | for page in document.pages: 15 | for block in page.contained_blocks(document, self.block_types): 16 | self.format_block(document, block) 17 | 18 | 19 | def format_block(self, document: Document, block: Code): 20 | min_left = 9999 # will contain x- coord of column 0 21 | total_width = 0 22 | total_chars = 0 23 | 24 | contained_lines = block.contained_blocks(document, (BlockTypes.Line,)) 25 | for line in contained_lines: 26 | min_left = min(line.polygon.bbox[0], min_left) 27 | total_width += line.polygon.width 28 | total_chars += len(line.raw_text(document)) 29 | 30 | avg_char_width = total_width / max(total_chars, 1) 31 | code_text = "" 32 | is_new_line = False 33 | for line in contained_lines: 34 | text = line.raw_text(document) 35 | if avg_char_width == 0: 36 | prefix = "" 37 | else: 38 | total_spaces = int((line.polygon.bbox[0] - min_left) / avg_char_width) 39 | prefix = " " * max(0, total_spaces) 40 | 41 | if is_new_line: 42 | text = prefix + text 43 | 44 | code_text += text 45 | is_new_line = text.endswith("\n") 46 | 47 | block.code = code_text.rstrip() 48 | ``` -------------------------------------------------------------------------------- /marker/settings.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Optional 2 | 3 | from dotenv import find_dotenv 4 | from pydantic import computed_field 5 | from pydantic_settings import BaseSettings 6 | import torch 7 | import os 8 | 9 | 10 | class Settings(BaseSettings): 11 | # Paths 12 | BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 13 | OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results") 14 | FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts") 15 | DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data") 16 | ARTIFACT_URL: str = "https://models.datalab.to/artifacts" 17 | FONT_NAME: str = 
"GoNotoCurrent-Regular.ttf" 18 | FONT_PATH: str = os.path.join(FONT_DIR, FONT_NAME) 19 | LOGLEVEL: str = "INFO" 20 | 21 | # General 22 | OUTPUT_ENCODING: str = "utf-8" 23 | OUTPUT_IMAGE_FORMAT: str = "JPEG" 24 | 25 | # LLM 26 | GOOGLE_API_KEY: Optional[str] = "" 27 | 28 | # General models 29 | TORCH_DEVICE: Optional[str] = ( 30 | None # Note: MPS device does not work for text detection, and will default to CPU 31 | ) 32 | 33 | @computed_field 34 | @property 35 | def TORCH_DEVICE_MODEL(self) -> str: 36 | if self.TORCH_DEVICE is not None: 37 | return self.TORCH_DEVICE 38 | 39 | if torch.cuda.is_available(): 40 | return "cuda" 41 | 42 | if torch.backends.mps.is_available(): 43 | return "mps" 44 | 45 | return "cpu" 46 | 47 | @computed_field 48 | @property 49 | def MODEL_DTYPE(self) -> torch.dtype: 50 | if self.TORCH_DEVICE_MODEL == "cuda": 51 | return torch.bfloat16 52 | else: 53 | return torch.float32 54 | 55 | class Config: 56 | env_file = find_dotenv("local.env") 57 | extra = "ignore" 58 | 59 | 60 | settings = Settings() 61 | ``` -------------------------------------------------------------------------------- /marker/converters/ocr.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Tuple 2 | 3 | from marker.builders.document import DocumentBuilder 4 | from marker.builders.line import LineBuilder 5 | from marker.builders.ocr import OcrBuilder 6 | from marker.converters.pdf import PdfConverter 7 | from marker.processors import BaseProcessor 8 | from marker.processors.equation import EquationProcessor 9 | from marker.providers.registry import provider_from_filepath 10 | from marker.renderers.ocr_json import OCRJSONRenderer 11 | 12 | 13 | class OCRConverter(PdfConverter): 14 | default_processors: Tuple[BaseProcessor, ...] 
= (EquationProcessor,) 15 | 16 | def __init__(self, *args, **kwargs): 17 | super().__init__(*args, **kwargs) 18 | 19 | if not self.config: 20 | self.config = {} 21 | 22 | self.config["force_ocr"] = True 23 | self.renderer = OCRJSONRenderer 24 | 25 | def build_document(self, filepath: str): 26 | provider_cls = provider_from_filepath(filepath) 27 | layout_builder = self.resolve_dependencies(self.layout_builder_class) 28 | line_builder = self.resolve_dependencies(LineBuilder) 29 | ocr_builder = self.resolve_dependencies(OcrBuilder) 30 | document_builder = DocumentBuilder(self.config) 31 | 32 | provider = provider_cls(filepath, self.config) 33 | document = document_builder(provider, layout_builder, line_builder, ocr_builder) 34 | 35 | for processor in self.processor_list: 36 | processor(document) 37 | 38 | return document 39 | 40 | def __call__(self, filepath: str): 41 | document = self.build_document(filepath) 42 | self.page_count = len(document.pages) 43 | renderer = self.resolve_dependencies(self.renderer) 44 | return renderer(document) 45 | ``` -------------------------------------------------------------------------------- /tests/builders/test_document_builder.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.text.line import Line 5 | 6 | 7 | @pytest.mark.filename("thinkpython.pdf") 8 | @pytest.mark.config({"page_range": [0]}) 9 | def test_document_builder(pdf_document): 10 | first_page = pdf_document.pages[0] 11 | assert first_page.structure[0] == "/page/0/SectionHeader/0" 12 | 13 | first_block = first_page.get_block(first_page.structure[0]) 14 | assert first_block.block_type == BlockTypes.SectionHeader 15 | assert first_block.text_extraction_method == "pdftext" 16 | 17 | first_text_block: Line = first_page.get_block(first_block.structure[0]) 18 | assert first_text_block.block_type == BlockTypes.Line 19 | 20 | first_span = 
first_page.get_block(first_text_block.structure[0]) 21 | assert first_span.block_type == BlockTypes.Span 22 | assert first_span.text == "Think Python" 23 | assert first_span.font == "URWPalladioL-Roma" 24 | assert first_span.formats == ["plain"] 25 | 26 | 27 | @pytest.mark.config({"page_range": [0]}) 28 | def test_document_builder_inline_eq(pdf_document): 29 | first_page = pdf_document.pages[0] 30 | assert first_page.structure[0] == "/page/0/SectionHeader/0" 31 | 32 | first_block = first_page.get_block(first_page.structure[0]) 33 | assert first_block.block_type == BlockTypes.SectionHeader 34 | assert first_block.text_extraction_method == "surya" 35 | 36 | first_text_block: Line = first_page.get_block(first_block.structure[0]) 37 | assert first_text_block.block_type == BlockTypes.Line 38 | 39 | first_span = first_page.get_block(first_text_block.structure[0]) 40 | assert first_span.block_type == BlockTypes.Span 41 | assert first_span.text.strip() == "Subspace Adversarial Training" 42 | assert "bold" in first_span.formats 43 | ``` -------------------------------------------------------------------------------- /marker/services/__init__.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Optional, List, Annotated 2 | from io import BytesIO 3 | 4 | import PIL 5 | from pydantic import BaseModel 6 | 7 | from marker.schema.blocks import Block 8 | from marker.util import assign_config, verify_config_keys 9 | import base64 10 | 11 | 12 | class BaseService: 13 | timeout: Annotated[int, "The timeout to use for the service."] = 30 14 | max_retries: Annotated[ 15 | int, "The maximum number of retries to use for the service." 16 | ] = 2 17 | retry_wait_time: Annotated[int, "The wait time between retries."] = 3 18 | max_output_tokens: Annotated[ 19 | int, "The maximum number of output tokens to generate." 
20 | ] = None 21 | 22 | def img_to_base64(self, img: PIL.Image.Image, format: str = "WEBP"): 23 | image_bytes = BytesIO() 24 | img.save(image_bytes, format=format) 25 | return base64.b64encode(image_bytes.getvalue()).decode("utf-8") 26 | 27 | def process_images(self, images: List[PIL.Image.Image]) -> list: 28 | raise NotImplementedError 29 | 30 | def format_image_for_llm(self, image): 31 | if not image: 32 | return [] 33 | 34 | if not isinstance(image, list): 35 | image = [image] 36 | 37 | image_parts = self.process_images(image) 38 | return image_parts 39 | 40 | def __init__(self, config: Optional[BaseModel | dict] = None): 41 | assign_config(self, config) 42 | 43 | # Ensure we have all necessary fields filled out (API keys, etc.) 44 | verify_config_keys(self) 45 | 46 | def __call__( 47 | self, 48 | prompt: str, 49 | image: PIL.Image.Image | List[PIL.Image.Image] | None, 50 | block: Block | None, 51 | response_schema: type[BaseModel], 52 | max_retries: int | None = None, 53 | timeout: int | None = None, 54 | ): 55 | raise NotImplementedError 56 | ``` -------------------------------------------------------------------------------- /tests/providers/test_document_providers.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | 4 | @pytest.mark.config({"page_range": [0]}) 5 | @pytest.mark.filename("lambda.pptx") 6 | def test_pptx_provider(doc_provider): 7 | assert doc_provider.get_images([0], 72)[0].size == (842, 596) 8 | 9 | page_lines = doc_provider.get_page_lines(0) 10 | 11 | spans = page_lines[0].spans 12 | assert spans[0].text == "Lambda Calculus" 13 | 14 | spans = page_lines[1].spans 15 | assert spans[0].text == "CSE 340 – Principles of Programming Languages" 16 | 17 | 18 | @pytest.mark.config({"page_range": [0]}) 19 | @pytest.mark.filename("manual.epub") 20 | def test_epub_provider(doc_provider): 21 | assert doc_provider.get_images([0], 72)[0].size == (596, 842) 22 | 23 | page_lines = 
doc_provider.get_page_lines(0) 24 | 25 | spans = page_lines[0].spans 26 | assert spans[0].text == "The Project Gutenberg eBook of Simple" 27 | 28 | 29 | @pytest.mark.config({"page_range": [0]}) 30 | @pytest.mark.filename("china.html") 31 | def test_html_provider(doc_provider): 32 | assert doc_provider.get_images([0], 72)[0].size == (596, 842) 33 | 34 | page_lines = doc_provider.get_page_lines(0) 35 | 36 | spans = page_lines[0].spans 37 | assert spans[0].text == "Jump to content" 38 | 39 | @pytest.mark.config({"page_range": [0]}) 40 | @pytest.mark.filename("gatsby.docx") 41 | def test_docx_provider(doc_provider): 42 | assert doc_provider.get_images([0], 72)[0].size == (596, 842) 43 | 44 | page_lines = doc_provider.get_page_lines(0) 45 | 46 | spans = page_lines[0].spans 47 | assert spans[0].text == "Themes" 48 | 49 | 50 | @pytest.mark.config({"page_range": [0]}) 51 | @pytest.mark.filename("single_sheet.xlsx") 52 | def test_xlsx_provider(doc_provider): 53 | assert doc_provider.get_images([0], 72)[0].size == (842, 596) 54 | 55 | page_lines = doc_provider.get_page_lines(0) 56 | 57 | spans = page_lines[0].spans 58 | assert spans[0].text == "Sheet1" ``` -------------------------------------------------------------------------------- /marker/schema/blocks/listitem.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.blocks import Block 5 | 6 | 7 | def replace_bullets(child_blocks): 8 | # Replace bullet characters with a - 9 | first_block = None 10 | while len(child_blocks) > 0: 11 | first_block = child_blocks[0] 12 | child_blocks = first_block.children 13 | 14 | if first_block is not None and first_block.id.block_type == BlockTypes.Line: 15 | bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○ഠ ം◦■▪▫–—-]( )" 16 | first_block.html = re.sub(bullet_pattern, r"\1\2", first_block.html) 17 | 18 | 19 | class ListItem(Block): 20 | block_type: BlockTypes = 
BlockTypes.ListItem 21 | list_indent_level: int = 0 22 | block_description: str = "A list item that is part of a list. This block is used to represent a single item in a list." 23 | html: str | None = None 24 | 25 | def assemble_html( 26 | self, document, child_blocks, parent_structure, block_config=None 27 | ): 28 | template = super().assemble_html( 29 | document, child_blocks, parent_structure, block_config 30 | ) 31 | template = template.replace("\n", " ") 32 | # Remove the first bullet character 33 | replace_bullets(child_blocks) 34 | 35 | if self.html: 36 | template = ( 37 | super() 38 | .handle_html_output( 39 | document, child_blocks, parent_structure, block_config 40 | ) 41 | .strip() 42 | ) 43 | template = template.replace("<li>", "").replace("</li>", "") 44 | 45 | el_attr = f" block-type='{self.block_type}'" 46 | if self.list_indent_level: 47 | return f"<ul><li{el_attr} class='list-indent-{self.list_indent_level}'>{template}</li></ul>" 48 | return f"<li{el_attr}>{template}</li>" 49 | ``` -------------------------------------------------------------------------------- /marker/providers/image.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List, Annotated 2 | from PIL import Image 3 | 4 | from marker.providers import ProviderPageLines, BaseProvider 5 | from marker.schema.polygon import PolygonBox 6 | from marker.schema.text import Line 7 | from pdftext.schema import Reference 8 | 9 | 10 | class ImageProvider(BaseProvider): 11 | page_range: Annotated[ 12 | List[int], 13 | "The range of pages to process.", 14 | "Default is None, which will process all pages.", 15 | ] = None 16 | 17 | image_count: int = 1 18 | 19 | def __init__(self, filepath: str, config=None): 20 | super().__init__(filepath, config) 21 | 22 | self.images = [Image.open(filepath)] 23 | self.page_lines: ProviderPageLines = {i: [] for i in range(self.image_count)} 24 | 25 | if self.page_range is None: 26 | self.page_range = 
range(self.image_count) 27 | 28 | assert max(self.page_range) < self.image_count and min(self.page_range) >= 0, ( 29 | f"Invalid page range, values must be between 0 and {len(self.doc) - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}." 30 | ) 31 | 32 | self.page_bboxes = { 33 | i: [0, 0, self.images[i].size[0], self.images[i].size[1]] 34 | for i in self.page_range 35 | } 36 | 37 | def __len__(self): 38 | return self.image_count 39 | 40 | def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]: 41 | return [self.images[i] for i in idxs] 42 | 43 | def get_page_bbox(self, idx: int) -> PolygonBox | None: 44 | bbox = self.page_bboxes[idx] 45 | if bbox: 46 | return PolygonBox.from_bbox(bbox) 47 | 48 | def get_page_lines(self, idx: int) -> List[Line]: 49 | return self.page_lines[idx] 50 | 51 | def get_page_refs(self, idx: int) -> List[Reference]: 52 | return [] 53 | ``` -------------------------------------------------------------------------------- /tests/builders/test_ocr_pipeline.py: -------------------------------------------------------------------------------- ```python 1 | import pytest 2 | 3 | from marker.schema import BlockTypes 4 | from marker.schema.text.line import Line 5 | 6 | 7 | def _ocr_pipeline_test(pdf_document): 8 | first_page = pdf_document.pages[0] 9 | assert first_page.structure[0] == "/page/0/SectionHeader/0" 10 | 11 | first_block = first_page.get_block(first_page.structure[0]) 12 | assert first_block.text_extraction_method == "surya" 13 | assert first_block.block_type == BlockTypes.SectionHeader 14 | 15 | first_text_block: Line = first_page.get_block(first_block.structure[0]) 16 | assert first_text_block.block_type == BlockTypes.Line 17 | 18 | first_span = first_page.get_block(first_text_block.structure[0]) 19 | assert first_span.block_type == BlockTypes.Span 20 | assert first_span.text.strip() == "Subspace Adversarial Training" 21 | 22 | # Ensure we match all text lines up properly 23 | # 
Makes sure the OCR bbox is being scaled to the same scale as the layout boxes 24 | text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,)) 25 | text_blocks = first_page.contained_blocks( 26 | pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath) 27 | ) 28 | # assert len(text_lines) == 83 29 | 30 | # Ensure the bbox sizes match up 31 | max_line_position = max([line.polygon.y_end for line in text_lines]) 32 | max_block_position = max( 33 | [block.polygon.y_end for block in text_blocks if block.source == "layout"] 34 | ) 35 | assert max_line_position <= (max_block_position * 1.02) 36 | 37 | 38 | @pytest.mark.config({"force_ocr": True, "page_range": [0]}) 39 | def test_ocr_pipeline(pdf_document): 40 | _ocr_pipeline_test(pdf_document) 41 | 42 | 43 | @pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True}) 44 | def test_ocr_with_inline_pipeline(pdf_document): 45 | _ocr_pipeline_test(pdf_document) 46 | ``` -------------------------------------------------------------------------------- /benchmarks/table/gemini.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | from PIL import Image 3 | from google import genai 4 | from google.genai import types 5 | from io import BytesIO 6 | from pydantic import BaseModel 7 | 8 | from marker.settings import settings 9 | 10 | prompt = """ 11 | You're an expert document analyst who is good at turning tables in documents into HTML. Analyze the provided image, and convert it to a faithful HTML representation. 12 | 13 | Guidelines: 14 | - Keep the HTML simple and concise. 15 | - Only include the <table> tag and contents. 16 | - Only use <table>, <tr>, and <td> tags. Only use the colspan and rowspan attributes if necessary. Do not use <tbody>, <thead>, or <th> tags. 17 | - Make sure the table is as faithful to the image as possible with the given tags. 18 | 19 | **Instructions** 20 | 1. 
Analyze the image, and determine the table structure. 21 | 2. Convert the table image to HTML, following the guidelines above. 22 | 3. Output only the HTML for the table, starting with the <table> tag and ending with the </table> tag. 23 | """.strip() 24 | 25 | class TableSchema(BaseModel): 26 | table_html: str 27 | 28 | def gemini_table_rec(image: Image.Image): 29 | client = genai.Client( 30 | api_key=settings.GOOGLE_API_KEY, 31 | http_options={"timeout": 60000} 32 | ) 33 | 34 | image_bytes = BytesIO() 35 | image.save(image_bytes, format="PNG") 36 | 37 | responses = client.models.generate_content( 38 | model="gemini-2.0-flash", 39 | contents=[types.Part.from_bytes(data=image_bytes.getvalue(), mime_type="image/png"), prompt], # According to gemini docs, it performs better if the image is the first element 40 | config={ 41 | "temperature": 0, 42 | "response_schema": TableSchema, 43 | "response_mime_type": "application/json", 44 | }, 45 | ) 46 | 47 | output = responses.candidates[0].content.parts[0].text 48 | return json.loads(output)["table_html"] ``` -------------------------------------------------------------------------------- /tests/builders/test_pdf_links.py: -------------------------------------------------------------------------------- ```python 1 | import re 2 | 3 | import pytest 4 | 5 | from marker.converters.pdf import PdfConverter 6 | from marker.renderers.markdown import MarkdownOutput 7 | from marker.schema import BlockTypes 8 | from marker.schema.document import Document 9 | from marker.util import classes_to_strings 10 | 11 | 12 | @pytest.mark.filename("arxiv_test.pdf") 13 | @pytest.mark.output_format("markdown") 14 | @pytest.mark.config({"disable_ocr": True}) 15 | def test_pdf_links(pdf_document: Document, config, renderer, model_dict, temp_doc): 16 | first_page = pdf_document.pages[1] 17 | 18 | processors = ["marker.processors.reference.ReferenceProcessor"] 19 | pdf_converter = PdfConverter( 20 | artifact_dict=model_dict, 21 | 
processor_list=processors, 22 | renderer=classes_to_strings([renderer])[0], 23 | config=config, 24 | ) 25 | 26 | for section_header_span in first_page.contained_blocks( 27 | pdf_document, (BlockTypes.Span,) 28 | ): 29 | if "II." in section_header_span.text: 30 | assert section_header_span.url == "#page-1-0" 31 | break 32 | else: 33 | raise ValueError("Could not find II. in the first page") 34 | 35 | section_header_block = first_page.contained_blocks( 36 | pdf_document, (BlockTypes.SectionHeader,) 37 | )[0] 38 | assert section_header_block.raw_text(pdf_document) == "II. THEORETICAL FRAMEWORK\n" 39 | 40 | assert first_page.refs[0].ref == "page-1-0" 41 | 42 | markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) 43 | markdown = markdown_output.markdown 44 | 45 | assert "[II.](#page-1-0)" in markdown 46 | assert '<span id="page-1-0"></span>II. THEORETICAL FRAMEWORK' in markdown 47 | 48 | for ref in set( 49 | [ 50 | f'<span id="page-{m[0]}-{m[1]}">' 51 | for m in re.findall(r"\]\(#page-(\d+)-(\d+)\)", markdown) 52 | ] 53 | ): 54 | assert ref in markdown, f"Reference {ref} not found in markdown" 55 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/download/llamaparse.py: -------------------------------------------------------------------------------- ```python 1 | import io 2 | import time 3 | 4 | import requests 5 | 6 | from benchmarks.overall.download.base import Downloader 7 | 8 | 9 | class LlamaParseDownloader(Downloader): 10 | service = "llamaparse" 11 | 12 | def get_html(self, pdf_bytes): 13 | rand_name = str(time.time()) + ".pdf" 14 | start = time.time() 15 | buff = io.BytesIO(pdf_bytes) 16 | md = upload_and_parse_file(self.api_key, rand_name, buff) 17 | end = time.time() 18 | if isinstance(md, bytes): 19 | md = md.decode("utf-8") 20 | 21 | return { 22 | "md": md, 23 | "time": end - start, 24 | } 25 | 26 | 27 | def upload_and_parse_file(api_key: str, fname: str, buff, max_retries: int = 180, delay: 
int = 1): 28 | headers = { 29 | "Authorization": f"Bearer {api_key}", 30 | "Accept": "application/json" 31 | } 32 | 33 | # Upload file 34 | files = { 35 | 'file': (fname, buff, 'application/pdf') 36 | } 37 | response = requests.post( 38 | 'https://api.cloud.llamaindex.ai/api/v1/parsing/upload', 39 | headers=headers, 40 | files=files 41 | ) 42 | response.raise_for_status() 43 | job_id = response.json()['id'] 44 | 45 | # Poll for completion 46 | for _ in range(max_retries): 47 | status_response = requests.get( 48 | f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}', 49 | headers=headers 50 | ) 51 | status_response.raise_for_status() 52 | if status_response.json()['status'] == 'SUCCESS': 53 | # Get results 54 | result_response = requests.get( 55 | f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}/result/markdown', 56 | headers=headers 57 | ) 58 | result_response.raise_for_status() 59 | return result_response.json()['markdown'] 60 | 61 | time.sleep(delay) 62 | 63 | raise TimeoutError("Job did not complete within the maximum retry attempts") ``` -------------------------------------------------------------------------------- /benchmarks/overall/download/mistral.py: -------------------------------------------------------------------------------- ```python 1 | import io 2 | import time 3 | import requests 4 | 5 | from benchmarks.overall.download.base import Downloader 6 | 7 | 8 | class MistralDownloader(Downloader): 9 | service = "mistral" 10 | 11 | def get_html(self, pdf_bytes): 12 | rand_name = str(time.time()) + ".pdf" 13 | start = time.time() 14 | buff = io.BytesIO(pdf_bytes) 15 | md = upload_and_process_file(self.api_key, rand_name, buff) 16 | end = time.time() 17 | if isinstance(md, bytes): 18 | md = md.decode("utf-8") 19 | 20 | return { 21 | "md": md, 22 | "time": end - start, 23 | } 24 | 25 | 26 | def upload_and_process_file(api_key: str, fname: str, buff): 27 | headers = { 28 | "Authorization": f"Bearer {api_key}" 29 | } 30 | 31 | 
upload_headers = headers.copy() 32 | files = { 33 | 'file': (fname, buff, 'application/pdf'), 34 | 'purpose': (None, 'ocr') 35 | } 36 | 37 | upload_response = requests.post( 38 | 'https://api.mistral.ai/v1/files', 39 | headers=upload_headers, 40 | files=files 41 | ) 42 | upload_response.raise_for_status() 43 | file_id = upload_response.json()['id'] 44 | 45 | url_headers = headers.copy() 46 | url_headers["Accept"] = "application/json" 47 | 48 | url_response = requests.get( 49 | f'https://api.mistral.ai/v1/files/{file_id}/url?expiry=24', 50 | headers=url_headers 51 | ) 52 | url_response.raise_for_status() 53 | signed_url = url_response.json()['url'] 54 | 55 | ocr_headers = headers.copy() 56 | ocr_headers["Content-Type"] = "application/json" 57 | 58 | ocr_data = { 59 | "model": "mistral-ocr-latest", 60 | "document": { 61 | "type": "document_url", 62 | "document_url": signed_url 63 | }, 64 | "include_image_base64": True 65 | } 66 | ocr_response = requests.post( 67 | 'https://api.mistral.ai/v1/ocr', 68 | headers=ocr_headers, 69 | json=ocr_data 70 | ) 71 | ocr_response.raise_for_status() 72 | result = ocr_response.json() 73 | return result["pages"][0]["markdown"] ``` -------------------------------------------------------------------------------- /tests/config/test_config.py: -------------------------------------------------------------------------------- ```python 1 | import sys 2 | from contextlib import suppress 3 | import click 4 | 5 | from marker.config.printer import CustomClickPrinter 6 | from marker.config.crawler import crawler 7 | from marker.config.parser import ConfigParser 8 | 9 | 10 | def capture_kwargs(argv): 11 | command = click.command(cls=CustomClickPrinter) 12 | captured_kwargs = {} 13 | 14 | def parse_args(**kwargs): 15 | captured_kwargs.update(kwargs) 16 | return kwargs 17 | 18 | original_argv = sys.argv 19 | sys.argv = argv 20 | try: 21 | with suppress(SystemExit): 22 | command(ConfigParser.common_options(parse_args))() 23 | finally: 24 | sys.argv = 
original_argv
25 | 
26 |     return captured_kwargs
27 | 
28 | 
29 | def test_config_parser():
30 |     argv = [  # local list: don't mutate the global sys.argv (capture_kwargs saves/restores it)
31 |         "test",
32 |         "--disable_multiprocessing",
33 |         "--output_dir",
34 |         "output_dir",
35 |         "--height_tolerance",
36 |         "0.5",
37 |     ]
38 |     kwargs = capture_kwargs(argv)
39 |     parser = ConfigParser(kwargs)
40 |     config_dict = parser.generate_config_dict()
41 | 
42 |     # Validate kwarg capturing
43 |     assert kwargs["disable_multiprocessing"]
44 |     assert kwargs["output_dir"] == "output_dir"
45 | 
46 |     assert config_dict["pdftext_workers"] == 1 # disabling multiprocessing does this
47 |     assert config_dict["height_tolerance"] == 0.5
48 | 
49 | 
50 | def test_config_none():
51 |     kwargs = capture_kwargs(["test"])
52 | 
53 |     for key in crawler.attr_set:
54 |         # We force some options to become flags for ease of use on the CLI
55 |         value = None
56 |         assert kwargs.get(key) is value
57 | 
58 | 
59 | def test_config_llm():
60 |     kwargs = capture_kwargs(["test", "--use_llm"])
61 |     parser = ConfigParser(kwargs)
62 |     config_dict = parser.generate_config_dict()
63 | 
64 |     # Validate kwarg capturing
65 |     assert config_dict["use_llm"]
66 | 
67 | 
68 | def test_config_force_ocr():
69 |     kwargs = capture_kwargs(["test", "--force_ocr"])
70 |     parser = ConfigParser(kwargs)
71 |     config_dict = parser.generate_config_dict()
72 | 
73 |     # Validate kwarg capturing
74 |     assert config_dict["force_ocr"]
75 | 
```
--------------------------------------------------------------------------------
/tests/converters/test_ocr_converter.py:
--------------------------------------------------------------------------------
```python
1 | import pytest
2 | 
3 | from marker.converters.ocr import OCRConverter
4 | from marker.renderers.ocr_json import OCRJSONOutput, OCRJSONPageOutput
5 | 
6 | 
7 | def _ocr_converter(config, model_dict, temp_pdf, line_count: int, eq_count: int):
8 |     converter = OCRConverter(artifact_dict=model_dict, config=config)
9 | 
10 |     ocr_json: OCRJSONOutput = converter(temp_pdf.name)
11 |     pages = 
ocr_json.children
12 | 
13 |     assert len(pages) == 1
14 |     # assert len(pages[0].children) == line_count
15 |     eqs = [line for line in pages[0].children if line.block_type == "Equation"]
16 |     assert len(eqs) == eq_count
17 |     return pages
18 | 
19 | 
20 | def check_bboxes(page: OCRJSONPageOutput, lines):
21 |     page_size = page.bbox
22 |     for line in lines:
23 |         assert len(line.children) > 0
24 |         for child in line.children:
25 |             bbox = child.bbox
26 |             assert all(
27 |                 [
28 |                     bbox[0] >= page_size[0],
29 |                     bbox[1] >= page_size[1],
30 |                     bbox[2] <= page_size[2],
31 |                     bbox[3] <= page_size[3],
32 |                 ]
33 |             ), "Child bbox is outside page bbox"
34 | 
35 | 
36 | @pytest.mark.config({"page_range": [0]})
37 | def test_ocr_converter(config, model_dict, temp_doc):
38 |     _ocr_converter(config, model_dict, temp_doc, 85, 2)
39 | 
40 | 
41 | @pytest.mark.filename("pres.pdf")
42 | @pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True})
43 | def test_ocr_converter_force(config, model_dict, temp_doc):
44 |     pages = _ocr_converter(config, model_dict, temp_doc, 10, 0)
45 |     lines = [line for line in pages[0].children if line.block_type == "Line"]
46 |     check_bboxes(pages[0], lines)
47 | 
48 | 
49 | @pytest.mark.filename("pres.pdf")
50 | @pytest.mark.config({"page_range": [1], "keep_chars": True})
51 | def test_ocr_converter_keep(config, model_dict, temp_doc):
52 |     pages = _ocr_converter(config, model_dict, temp_doc, 10, 0)
53 |     lines = [line for line in pages[0].children if line.block_type == "Line"]
54 |     check_bboxes(pages[0], lines)
55 | 
```
--------------------------------------------------------------------------------
/benchmarks/overall/display/dataset.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | from typing import List
3 | import PIL.Image  # required by the placeholder-image fallback in build_dataset
4 | import datasets
5 | from tqdm import tqdm
6 | 
7 | from benchmarks.overall.registry import METHOD_REGISTRY
8 | from benchmarks.overall.schema import FullResult
9 | 
10 | 
11 | def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_types: List[str], max_rows: int | None = None) -> datasets.Dataset:
12 |     rows = []
13 |     for idx, sample in tqdm(enumerate(bench_dataset), desc="Building dataset"):
14 |         if idx not in result["markdown"]:
15 |             continue
16 | 
17 |         if max_rows is not None and idx >= max_rows:
18 |             break
19 | 
20 |         row = {
21 |             "uuid": sample["uuid"],
22 |             "classification": sample["classification"],
23 |             "language": sample["language"],
24 |             "img": sample["img"],
25 |         }
26 |         for method in result["markdown"][idx]:
27 |             if method == "gt":
28 |                 continue
29 | 
30 |             method_cls = METHOD_REGISTRY[method]()
31 |             md = result["markdown"][idx][method]
32 |             try:
33 |                 method_img = method_cls.render(result["markdown"][idx][method])
34 |             except Exception as e:
35 |                 # This can happen when the markdown is None
36 |                 method_img = PIL.Image.new("RGB", (200, 200))
37 | 
38 |             row[f"{method}_md"] = md
39 |             row[f"{method}_img"] = method_img
40 | 
41 |             for score_type in score_types:
42 |                 try:
43 |                     row[f"{method}_{score_type}"] = result["scores"][idx][method][score_type]["score"]
44 |                 except KeyError:
45 |                     row[f"{method}_{score_type}"] = -1.0 # Missing score
46 |                 try:
47 |                     row[f"{method}_{score_type}_detail"] = json.dumps(result["scores"][idx][method][score_type]["specific_scores"])
48 |                 except KeyError:
49 |                     row[f"{method}_{score_type}_detail"] = "" # Missing detail
50 |         rows.append(row)
51 |     ds = datasets.Dataset.from_list(rows)
52 |     return ds
53 | 
54 | 
```
--------------------------------------------------------------------------------
/benchmarks/overall/download/base.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | from json import JSONDecodeError
3 | from pathlib import Path
4 | 
5 | import datasets
6 | from tqdm import tqdm
7 | 
8 | 
9 | class Downloader:
10 |     cache_path: Path = Path("cache")
11 |     service: str
12 | 
13 |     def __init__(self, api_key, app_id, max_rows: int = 2200):
14 | self.cache_path.mkdir(exist_ok=True) 15 | self.max_rows = max_rows 16 | self.api_key = api_key 17 | self.app_id = app_id 18 | self.ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train") 19 | 20 | def get_html(self, pdf_bytes): 21 | raise NotImplementedError 22 | 23 | def upload_ds(self): 24 | rows = [] 25 | for file in self.cache_path.glob("*.json"): 26 | with open(file, "r") as f: 27 | data = json.load(f) 28 | rows.append(data) 29 | 30 | out_ds = datasets.Dataset.from_list(rows, features=datasets.Features({ 31 | "md": datasets.Value("string"), 32 | "uuid": datasets.Value("string"), 33 | "time": datasets.Value("float"), 34 | })) 35 | out_ds.push_to_hub(f"datalab-to/marker_benchmark_{self.service}", private=True) 36 | 37 | def generate_data(self): 38 | max_rows = self.max_rows 39 | for idx, sample in tqdm(enumerate(self.ds), desc=f"Saving {self.service} results"): 40 | cache_file = self.cache_path / f"{idx}.json" 41 | if cache_file.exists(): 42 | continue 43 | 44 | pdf_bytes = sample["pdf"] # This is a single page PDF 45 | try: 46 | out_data = self.get_html(pdf_bytes) 47 | except JSONDecodeError as e: 48 | print(f"Error with sample {idx}: {e}") 49 | continue 50 | except Exception as e: 51 | print(f"Error with sample {idx}: {e}") 52 | continue 53 | out_data["uuid"] = sample["uuid"] 54 | 55 | with cache_file.open("w") as f: 56 | json.dump(out_data, f) 57 | 58 | if idx >= max_rows: 59 | break 60 | 61 | def __call__(self): 62 | self.generate_data() 63 | self.upload_ds() 64 | ``` -------------------------------------------------------------------------------- /marker/processors/reference.py: -------------------------------------------------------------------------------- ```python 1 | import numpy as np 2 | 3 | from marker.processors import BaseProcessor 4 | from marker.schema import BlockTypes 5 | from marker.schema.blocks import Reference 6 | from marker.schema.document import Document 7 | from marker.schema.groups.list import ListGroup 8 | from 
marker.schema.groups.table import TableGroup 9 | from marker.schema.registry import get_block_class 10 | from marker.schema.groups.figure import FigureGroup 11 | 12 | 13 | class ReferenceProcessor(BaseProcessor): 14 | """ 15 | A processor for adding references to the document. 16 | """ 17 | 18 | def __init__(self, config): 19 | super().__init__(config) 20 | 21 | def __call__(self, document: Document): 22 | ReferenceClass: Reference = get_block_class(BlockTypes.Reference) 23 | 24 | for page in document.pages: 25 | refs = page.refs 26 | ref_starts = np.array([ref.coord for ref in refs]) 27 | 28 | blocks = [] 29 | for block_id in page.structure: 30 | block = page.get_block(block_id) 31 | if isinstance(block, (ListGroup, FigureGroup, TableGroup)): 32 | blocks.extend([page.get_block(b) for b in block.structure]) 33 | else: 34 | blocks.append(block) 35 | blocks = [b for b in blocks if not b.ignore_for_output] 36 | 37 | block_starts = np.array([block.polygon.bbox[:2] for block in blocks]) 38 | 39 | if not (len(refs) and len(block_starts)): 40 | continue 41 | 42 | distances = np.linalg.norm(block_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2) 43 | for ref_idx in range(len(ref_starts)): 44 | block_idx = np.argmin(distances[:, ref_idx]) 45 | block = blocks[block_idx] 46 | 47 | ref_block = page.add_full_block(ReferenceClass( 48 | ref=refs[ref_idx].ref, 49 | polygon=block.polygon, 50 | page_id=page.page_id 51 | )) 52 | if block.structure is None: 53 | block.structure = [] 54 | block.structure.insert(0, ref_block.id) 55 | ``` -------------------------------------------------------------------------------- /marker/builders/document.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Annotated 2 | 3 | from marker.builders import BaseBuilder 4 | from marker.builders.layout import LayoutBuilder 5 | from marker.builders.line import LineBuilder 6 | from marker.builders.ocr import OcrBuilder 7 | from 
marker.providers.pdf import PdfProvider 8 | from marker.schema import BlockTypes 9 | from marker.schema.document import Document 10 | from marker.schema.groups.page import PageGroup 11 | from marker.schema.registry import get_block_class 12 | 13 | 14 | class DocumentBuilder(BaseBuilder): 15 | """ 16 | Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder. 17 | """ 18 | lowres_image_dpi: Annotated[ 19 | int, 20 | "DPI setting for low-resolution page images used for Layout and Line Detection.", 21 | ] = 96 22 | highres_image_dpi: Annotated[ 23 | int, 24 | "DPI setting for high-resolution page images used for OCR.", 25 | ] = 192 26 | disable_ocr: Annotated[ 27 | bool, 28 | "Disable OCR processing.", 29 | ] = False 30 | 31 | def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, line_builder: LineBuilder, ocr_builder: OcrBuilder): 32 | document = self.build_document(provider) 33 | layout_builder(document, provider) 34 | line_builder(document, provider) 35 | if not self.disable_ocr: 36 | ocr_builder(document, provider) 37 | return document 38 | 39 | def build_document(self, provider: PdfProvider): 40 | PageGroupClass: PageGroup = get_block_class(BlockTypes.Page) 41 | lowres_images = provider.get_images(provider.page_range, self.lowres_image_dpi) 42 | highres_images = provider.get_images(provider.page_range, self.highres_image_dpi) 43 | initial_pages = [ 44 | PageGroupClass( 45 | page_id=p, 46 | lowres_image=lowres_images[i], 47 | highres_image=highres_images[i], 48 | polygon=provider.get_page_bbox(p), 49 | refs=provider.get_page_refs(p) 50 | ) for i, p in enumerate(provider.page_range) 51 | ] 52 | DocumentClass: Document = get_block_class(BlockTypes.Document) 53 | return DocumentClass(filepath=provider.filepath, pages=initial_pages) 54 | ``` -------------------------------------------------------------------------------- /marker/services/ollama.py: -------------------------------------------------------------------------------- 
```python
1 | import json
2 | from typing import Annotated, List
3 | 
4 | import PIL
5 | import requests
6 | from marker.logger import get_logger
7 | from pydantic import BaseModel
8 | 
9 | from marker.schema.blocks import Block
10 | from marker.services import BaseService
11 | 
12 | logger = get_logger()
13 | 
14 | 
15 | class OllamaService(BaseService):
16 |     ollama_base_url: Annotated[
17 |         str, "The base url to use for ollama. No trailing slash."
18 |     ] = "http://localhost:11434"
19 |     ollama_model: Annotated[str, "The model name to use for ollama."] = (
20 |         "llama3.2-vision"
21 |     )
22 | 
23 |     def process_images(self, images):
24 |         image_bytes = [self.img_to_base64(img) for img in images]
25 |         return image_bytes
26 | 
27 |     def __call__(
28 |         self,
29 |         prompt: str,
30 |         image: PIL.Image.Image | List[PIL.Image.Image] | None,
31 |         block: Block | None,
32 |         response_schema: type[BaseModel],
33 |         max_retries: int | None = None,
34 |         timeout: int | None = None,
35 |     ):
36 |         url = f"{self.ollama_base_url}/api/generate"
37 |         headers = {"Content-Type": "application/json"}
38 | 
39 |         schema = response_schema.model_json_schema()
40 |         format_schema = {
41 |             "type": "object",
42 |             "properties": schema["properties"],
43 |             "required": schema["required"],
44 |         }
45 | 
46 |         image_bytes = self.format_image_for_llm(image)
47 | 
48 |         payload = {
49 |             "model": self.ollama_model,
50 |             "prompt": prompt,
51 |             "stream": False,
52 |             "format": format_schema,
53 |             "images": image_bytes,
54 |         }
55 | 
56 |         try:
57 |             response = requests.post(url, json=payload, headers=headers, timeout=timeout)  # honor caller timeout; None keeps old unbounded wait
58 |             response.raise_for_status()
59 |             response_data = response.json()
60 | 
61 |             total_tokens = (
62 |                 response_data["prompt_eval_count"] + response_data["eval_count"]
63 |             )
64 | 
65 |             if block:
66 |                 block.update_metadata(llm_request_count=1, llm_tokens_used=total_tokens)
67 | 
68 |             data = response_data["response"]
69 |             return json.loads(data)
70 |         except Exception as e:
71 |             logger.warning(f"Ollama inference failed: {e}")
72 | 
73 |             return {}
74 | 
```