This is page 1 of 7. Use http://codebase.md/datalab-to/marker?page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── ISSUE_TEMPLATE │ │ ├── breaking-bug-report.md │ │ ├── feature_request.md │ │ └── output-bug-report.md │ └── workflows │ ├── benchmarks.yml │ ├── ci.yml │ ├── cla.yml │ ├── publish.yml │ └── scripts.yml ├── .gitignore ├── .pre-commit-config.yaml ├── benchmarks │ ├── __init__.py │ ├── overall │ │ ├── __init__.py │ │ ├── display │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ └── table.py │ │ ├── download │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── llamaparse.py │ │ │ ├── main.py │ │ │ ├── mathpix.py │ │ │ └── mistral.py │ │ ├── elo.py │ │ ├── methods │ │ │ ├── __init__.py │ │ │ ├── docling.py │ │ │ ├── gt.py │ │ │ ├── llamaparse.py │ │ │ ├── marker.py │ │ │ ├── mathpix.py │ │ │ ├── mistral.py │ │ │ ├── olmocr.py │ │ │ └── schema.py │ │ ├── overall.py │ │ ├── registry.py │ │ ├── schema.py │ │ └── scorers │ │ ├── __init__.py │ │ ├── clean.py │ │ ├── heuristic.py │ │ ├── llm.py │ │ └── schema.py │ ├── table │ │ ├── __init__.py │ │ ├── gemini.py │ │ ├── inference.py │ │ ├── scoring.py │ │ └── table.py │ ├── throughput │ │ ├── __init__.py │ │ └── main.py │ └── verify_scores.py ├── chunk_convert.py ├── CLA.md ├── convert_single.py ├── convert.py ├── data │ ├── .gitignore │ ├── examples │ │ ├── json │ │ │ ├── multicolcnn.json │ │ │ ├── switch_trans.json │ │ │ └── thinkpython.json │ │ └── markdown │ │ ├── multicolcnn │ │ │ ├── _page_1_Figure_0.jpeg │ │ │ ├── _page_2_Picture_0.jpeg │ │ │ ├── _page_6_Figure_0.jpeg │ │ │ ├── _page_7_Figure_0.jpeg │ │ │ ├── multicolcnn_meta.json │ │ │ └── multicolcnn.md │ │ ├── switch_transformers │ │ │ ├── _page_11_Figure_4.jpeg │ │ │ ├── _page_12_Figure_4.jpeg │ │ │ ├── _page_13_Figure_2.jpeg │ │ │ ├── _page_18_Figure_1.jpeg │ │ │ ├── _page_18_Figure_3.jpeg │ │ │ ├── _page_2_Figure_3.jpeg │ │ │ ├── _page_20_Figure_1.jpeg │ │ │ ├── _page_20_Figure_4.jpeg │ │ │ ├── _page_27_Figure_1.jpeg │ │ │ ├── _page_29_Figure_1.jpeg │ │ │ ├── _page_30_Figure_1.jpeg │ │ │ ├── _page_31_Figure_3.jpeg │ │ │ ├── _page_4_Figure_1.jpeg │ │ │ ├── _page_5_Figure_3.jpeg │ │ │ ├── switch_trans_meta.json │ │ │ └── switch_trans.md │ │ └── thinkpython │ │ ├── _page_109_Figure_1.jpeg │ │ ├── _page_115_Figure_1.jpeg │ │ ├── _page_116_Figure_3.jpeg │ │ ├── _page_127_Figure_1.jpeg │ │ ├── _page_128_Figure_1.jpeg │ │ ├── _page_167_Figure_1.jpeg │ │ ├── _page_169_Figure_1.jpeg │ │ ├── _page_173_Figure_1.jpeg │ │ ├── _page_190_Figure_1.jpeg │ │ ├── _page_195_Figure_1.jpeg │ │ ├── _page_205_Figure_1.jpeg │ │ ├── _page_23_Figure_1.jpeg │ │ ├── _page_23_Figure_3.jpeg │ │ ├── _page_230_Figure_1.jpeg │ │ ├── _page_233_Figure_1.jpeg │ │ ├── _page_233_Figure_3.jpeg │ │ ├── _page_234_Figure_1.jpeg │ │ ├── _page_235_Figure_1.jpeg │ │ ├── _page_236_Figure_1.jpeg │ │ ├── _page_236_Figure_3.jpeg │ │ ├── _page_237_Figure_1.jpeg │ │ ├── _page_238_Figure_1.jpeg │ │ ├── _page_46_Figure_1.jpeg │ │ ├── _page_60_Figure_1.jpeg │ │ ├── _page_60_Figure_3.jpeg │ │ ├── _page_67_Figure_1.jpeg │ │ ├── _page_71_Figure_1.jpeg │ │ ├── _page_78_Figure_1.jpeg │ │ ├── _page_85_Figure_1.jpeg │ │ ├── _page_94_Figure_1.jpeg │ │ ├── _page_99_Figure_17.jpeg │ │ ├── _page_99_Figure_178.jpeg │ │ ├── thinkpython_meta.json │ │ └── thinkpython.md │ ├── images │ │ ├── overall.png │ │ ├── per_doc.png │ │ └── table.png │ └── latex_to_md.sh ├── examples │ ├── marker_modal_deployment.py │ └── README.md ├── extraction_app.py ├── LICENSE ├── marker │ ├── builders │ │ ├── __init__.py │ │ ├── document.py │ │ 
├── layout.py │ │ ├── line.py │ │ ├── ocr.py │ │ └── structure.py │ ├── config │ │ ├── __init__.py │ │ ├── crawler.py │ │ ├── parser.py │ │ └── printer.py │ ├── converters │ │ ├── __init__.py │ │ ├── extraction.py │ │ ├── ocr.py │ │ ├── pdf.py │ │ └── table.py │ ├── extractors │ │ ├── __init__.py │ │ ├── document.py │ │ └── page.py │ ├── logger.py │ ├── models.py │ ├── output.py │ ├── processors │ │ ├── __init__.py │ │ ├── blank_page.py │ │ ├── block_relabel.py │ │ ├── blockquote.py │ │ ├── code.py │ │ ├── debug.py │ │ ├── document_toc.py │ │ ├── equation.py │ │ ├── footnote.py │ │ ├── ignoretext.py │ │ ├── line_merge.py │ │ ├── line_numbers.py │ │ ├── list.py │ │ ├── llm │ │ │ ├── __init__.py │ │ │ ├── llm_complex.py │ │ │ ├── llm_equation.py │ │ │ ├── llm_form.py │ │ │ ├── llm_handwriting.py │ │ │ ├── llm_image_description.py │ │ │ ├── llm_mathblock.py │ │ │ ├── llm_meta.py │ │ │ ├── llm_page_correction.py │ │ │ ├── llm_sectionheader.py │ │ │ ├── llm_table_merge.py │ │ │ └── llm_table.py │ │ ├── order.py │ │ ├── page_header.py │ │ ├── reference.py │ │ ├── sectionheader.py │ │ ├── table.py │ │ ├── text.py │ │ └── util.py │ ├── providers │ │ ├── __init__.py │ │ ├── document.py │ │ ├── epub.py │ │ ├── html.py │ │ ├── image.py │ │ ├── pdf.py │ │ ├── powerpoint.py │ │ ├── registry.py │ │ ├── spreadsheet.py │ │ └── utils.py │ ├── renderers │ │ ├── __init__.py │ │ ├── chunk.py │ │ ├── extraction.py │ │ ├── html.py │ │ ├── json.py │ │ ├── markdown.py │ │ └── ocr_json.py │ ├── schema │ │ ├── __init__.py │ │ ├── blocks │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── basetable.py │ │ │ ├── caption.py │ │ │ ├── code.py │ │ │ ├── complexregion.py │ │ │ ├── equation.py │ │ │ ├── figure.py │ │ │ ├── footnote.py │ │ │ ├── form.py │ │ │ ├── handwriting.py │ │ │ ├── inlinemath.py │ │ │ ├── listitem.py │ │ │ ├── pagefooter.py │ │ │ ├── pageheader.py │ │ │ ├── picture.py │ │ │ ├── reference.py │ │ │ ├── sectionheader.py │ │ │ ├── table.py │ │ │ ├── tablecell.py │ │ │ ├── text.py │ │ │ └── toc.py │ │ ├── document.py │ │ ├── groups │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── figure.py │ │ │ ├── list.py │ │ │ ├── page.py │ │ │ ├── picture.py │ │ │ └── table.py │ │ ├── polygon.py │ │ ├── registry.py │ │ └── text │ │ ├── __init__.py │ │ ├── char.py │ │ ├── line.py │ │ └── span.py │ ├── scripts │ │ ├── __init__.py │ │ ├── chunk_convert.py │ │ ├── chunk_convert.sh │ │ ├── common.py │ │ ├── convert_single.py │ │ ├── convert.py │ │ ├── extraction_app.py │ │ ├── file_to_s3.py │ │ ├── run_streamlit_app.py │ │ ├── server.py │ │ └── streamlit_app.py │ ├── services │ │ ├── __init__.py │ │ ├── azure_openai.py │ │ ├── claude.py │ │ ├── gemini.py │ │ ├── ollama.py │ │ ├── openai.py │ │ └── vertex.py │ ├── settings.py │ ├── util.py │ └── utils │ ├── __init__.py │ ├── batch.py │ ├── gpu.py │ └── image.py ├── marker_app.py ├── marker_server.py ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── README.md ├── signatures │ └── version1 │ └── cla.json ├── static │ └── fonts │ └── .gitignore └── tests ├── builders │ ├── test_blank_page.py │ ├── test_document_builder.py │ ├── test_garbled_pdf.py │ ├── test_layout_replace.py │ ├── test_ocr_builder.py │ ├── test_ocr_pipeline.py │ ├── test_overriding.py │ ├── test_pdf_links.py │ ├── test_rotated_bboxes.py │ ├── test_strip_existing_ocr.py │ └── test_structure.py ├── config │ └── test_config.py ├── conftest.py ├── converters │ ├── test_extraction_converter.py │ ├── test_ocr_converter.py │ ├── test_pdf_converter.py │ └── test_table_converter.py ├── processors │ ├── 
test_document_toc_processor.py │ ├── test_equation_processor.py │ ├── test_footnote_processor.py │ ├── test_ignoretext.py │ ├── test_llm_processors.py │ ├── test_table_merge.py │ └── test_table_processor.py ├── providers │ ├── test_document_providers.py │ ├── test_image_provider.py │ └── test_pdf_provider.py ├── renderers │ ├── test_chunk_renderer.py │ ├── test_extract_images.py │ ├── test_html_renderer.py │ ├── test_json_renderer.py │ └── test_markdown_renderer.py ├── schema │ └── groups │ └── test_list_grouping.py ├── services │ └── test_service_init.py └── utils.py ``` # Files -------------------------------------------------------------------------------- /static/fonts/.gitignore: -------------------------------------------------------------------------------- ``` * !.gitignore ``` -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- ``` latex pdfs references ``` -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- ```yaml repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. rev: v0.9.10 hooks: # Run the linter. - id: ruff types_or: [ python, pyi ] args: [ --fix ] # Run the formatter. - id: ruff-format types_or: [ python, pyi ] ``` -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- ``` private.py .DS_Store local.env experiments test_data training wandb *.dat report.json benchmark_data debug_data temp.md temp conversion_results uploads /cache # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. 
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ .vscode/ ``` -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- ```markdown ## Usage Examples This directory contains examples of running `marker` in different contexts. ### Usage with Modal We have a [self-contained example](./marker_modal_deployment.py) that shows how you can quickly use [Modal](https://modal.com) to deploy `marker` by provisioning a container with a GPU, and expose that with an API so you can submit PDFs for conversion into Markdown, HTML, or JSON. It's a limited example that you can extend into different use cases. #### Pre-requisites Make sure you have the `modal` client installed by [following their instructions here](https://modal.com/docs/guide#getting-started). Modal's [Starter Plan](https://modal.com/pricing) includes $30 of free compute each month. Modal is [serverless](https://arxiv.org/abs/1902.03383), so you only pay for resources when you are using them. #### Running the example Once `modal` is configured, you can deploy it to your workspace by running: > modal deploy marker_modal_deployment.py Notes: - `marker` has a few models it uses. By default, the endpoint will check if these models are loaded and download them if not (first request will be slow). You can avoid this by running > modal run marker_modal_deployment.py::download_models Which will create a [`Modal Volume`](https://modal.com/docs/guide/Volumes) to store them for re-use. Once the deploy is finished, you can: - Test a file upload locally through your CLI using an `invoke_conversion` command we expose through Modal's [`local_entrypoint`](https://modal.com/docs/reference/modal.App#local_entrypoint) - Get the URL of your endpoint and make a request through a client of your choice. 
**Test from your CLI with `invoke_conversion`** If your endpoint is live, simply run this command: ``` $ modal run marker_modal_deployment.py::invoke_conversion --pdf-file <PDF_FILE_PATH> --output-format markdown ``` And it'll automatically detect the URL of your new endpoint using [`.get_web_url()`](https://modal.com/docs/guide/webhook-urls#determine-the-url-of-a-web-endpoint-from-code), make sure it's healthy, submit your file, and store its output on your machine (in the same directory). **Making a request using your own client** If you want to make requests elsewhere e.g. with cURL or a client like Insomnia, you'll need to get the URL. When your `modal deploy` command from earlier finishes, it'll include your endpoint URL at the end. For example: ``` $ modal deploy marker_modal_deployment.py ... ✓ Created objects. ├── 🔨 Created mount /marker/examples/marker_modal_deployment.py ├── 🔨 Created function download_models. ├── 🔨 Created function MarkerModalDemoService.*. └── 🔨 Created web endpoint for MarkerModalDemoService.fastapi_app => <YOUR_ENDPOINT_URL> ✓ App deployed in 149.877s! 🎉 ``` If you accidentally close your terminal session, you can also always go into Modal's dashboard and: - Find the app (default name: `datalab-marker-modal-demo`) - Click on `MarkerModalDemoService` - Find your endpoint URL Once you have your URL, make a request to `{YOUR_ENDPOINT_URL}/convert` like this (you can also use Insomnia, etc.): ``` curl --request POST \ --url {BASE_URL}/convert \ --header 'Content-Type: multipart/form-data' \ --form file=@/Users/cooldev/sample.pdf \ --form output_format=html ``` You should get a response like this ``` { "success": true, "filename": "sample.pdf", "output_format": "html", "json": null, "html": "<YOUR_RESPONSE_CONTENT>", "markdown": null, "images": {}, "metadata": {... page level metadata ...}, "page_count": 2 } ``` [Modal](https://modal.com) makes deploying and scaling models and inference workloads much easier. If you're interested in Datalab's managed API or on-prem document intelligence solution, check out [our platform here](https://datalab.to/?utm_source=gh-marker). ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- ```markdown # Marker Marker converts documents to markdown, JSON, chunks, and HTML quickly and accurately. - Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages - Formats tables, forms, equations, inline math, links, references, and code blocks - Extracts and saves images - Removes headers/footers/other artifacts - Extensible with your own formatting and logic - Does structured extraction, given a JSON schema (beta) - Optionally boost accuracy with LLMs (and your own prompt) - Works on GPU, CPU, or MPS For our managed API or on-prem document intelligence solution, check out [our platform here](https://datalab.to?utm_source=gh-marker). ## Performance <img src="data/images/overall.png" width="800px"/> Marker benchmarks favorably compared to cloud services like Llamaparse and Mathpix, as well as other open source tools. The above results are running single PDF pages serially. Marker is significantly faster when running in batch mode, with a projected throughput of 25 pages/second on an H100. See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks. ## Hybrid Mode For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker. 
This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms. It can use any Gemini or Ollama model. By default, it uses `gemini-2.0-flash`. See [below](#llm-services) for details.

Here is a table benchmark comparing marker, Gemini Flash alone, and marker with `use_llm`:

<img src="data/images/table.png" width="400px"/>

As you can see, the `use_llm` mode offers higher accuracy than marker or Gemini alone.

## Examples

| PDF | File type | Markdown | JSON |
|-----|-----------|----------|------|
| [Think Python](https://greenteapress.com/thinkpython/thinkpython.pdf) | Textbook | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/thinkpython/thinkpython.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/thinkpython.json) |
| [Switch Transformers](https://arxiv.org/pdf/2101.03961.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/switch_transformers/switch_trans.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/switch_trans.json) |
| [Multi-column CNN](https://arxiv.org/pdf/1804.07821.pdf) | arXiv paper | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/markdown/multicolcnn/multicolcnn.md) | [View](https://github.com/VikParuchuri/marker/blob/master/data/examples/json/multicolcnn.json) |

# Commercial usage

Our model weights use a modified AI Pubs Open Rail-M license (free for research, personal use, and startups under $2M funding/revenue) and our code is GPL. For broader commercial licensing or to remove GPL requirements, visit our pricing page [here](https://www.datalab.to/pricing?utm_source=gh-marker).

# Hosted API & On-prem

There's a [hosted API](https://www.datalab.to?utm_source=gh-marker) and [painless on-prem solution](https://www.datalab.to/blog/self-serve-on-prem-licensing) for marker - it's free to sign up, and we'll throw in credits for you to test it out. The API:

- Supports PDF, image, PPT, PPTX, DOC, DOCX, XLS, XLSX, HTML, EPUB files
- Is 1/4th the price of leading cloud-based competitors
- Fast - ~15s for a 250 page PDF
- Supports LLM mode
- High uptime (99.99%)

# Community

[Discord](https://discord.gg//KuZwXNGnfH) is where we discuss future development.

# Installation

You'll need Python 3.10+ and [PyTorch](https://pytorch.org/get-started/locally/).

Install with:

```shell
pip install marker-pdf
```

If you want to use marker on documents other than PDFs, you will need to install additional dependencies with:

```shell
pip install marker-pdf[full]
```

# Usage

First, some configuration:

- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
- Some PDFs, even digital ones, have bad text in them. Set `--force_ocr` to force OCR on all lines, or `--strip_existing_ocr` to keep all digital text and strip out any existing OCR text.
- If you care about inline math, set `force_ocr` to convert inline math to LaTeX.

## Interactive App

I've included a streamlit app that lets you interactively try marker with some basic options. Run it with:

```shell
pip install streamlit streamlit-ace
marker_gui
```

## Convert a single file

```shell
marker_single /path/to/file.pdf
```

You can pass in PDFs or images.
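For example, a typical invocation that converts the first few pages of a PDF to JSON might look like the following (an illustrative sketch - the individual flags are described in the options list below):

```shell
# Convert pages 0-5 to JSON and write the output to ./output (hypothetical paths)
marker_single /path/to/file.pdf --output_format json --page_range "0-5" --output_dir ./output
```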
Options:

- `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
- `--output_format [markdown|json|html|chunks]`: Specify the format for the output results.
- `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
- `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n`
- `--use_llm`: Uses an LLM to improve accuracy. You will need to configure the LLM backend - see [below](#llm-services).
- `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text. This will also format inline math properly.
- `--block_correction_prompt`: If LLM mode is active, an optional prompt that will be used to correct the output of marker. This is useful for custom formatting or logic that you want to apply to the output.
- `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya.
- `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`.
- `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description.
- `--debug`: Enable debug mode for additional logging and diagnostic information.
- `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"`
- `--config_json PATH`: Path to a JSON configuration file containing additional settings.
- `config --help`: List all available builders, processors, and converters, and their associated configuration. These values can be used to build a JSON configuration file for additional tweaking of marker defaults.
- `--converter_cls`: One of `marker.converters.pdf.PdfConverter` (default) or `marker.converters.table.TableConverter`. The `PdfConverter` will convert the whole PDF, the `TableConverter` will only extract and convert tables.
- `--llm_service`: Which llm service to use if `--use_llm` is passed. This defaults to `marker.services.gemini.GoogleGeminiService`.
- `--help`: See all of the flags that can be passed into marker (it supports many more options than are listed above).

The list of supported languages for surya OCR is [here](https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py). If you don't need OCR, marker can work with any language.

## Convert multiple files

```shell
marker /path/to/input/folder
```

- `marker` supports all the same options from `marker_single` above.
- `--workers` is the number of conversion workers to run simultaneously. This is automatically set by default, but you can increase it to improve throughput, at the cost of more CPU/GPU usage. Marker will use 5GB of VRAM per worker at the peak, and 3.5GB average.

## Convert multiple files on multiple GPUs

```shell
NUM_DEVICES=4 NUM_WORKERS=15 marker_chunk_convert ../pdf_in ../md_out
```

- `NUM_DEVICES` is the number of GPUs to use. Should be `2` or greater.
- `NUM_WORKERS` is the number of parallel processes to run on each GPU.

## Use from python

See the `PdfConverter` class at `marker/converters/pdf.py` for additional arguments that can be passed.
```python from marker.converters.pdf import PdfConverter from marker.models import create_model_dict from marker.output import text_from_rendered converter = PdfConverter( artifact_dict=create_model_dict(), ) rendered = converter("FILEPATH") text, _, images = text_from_rendered(rendered) ``` `rendered` will be a pydantic basemodel with different properties depending on the output type requested. With markdown output (default), you'll have the properties `markdown`, `metadata`, and `images`. For json output, you'll have `children`, `block_type`, and `metadata`. ### Custom configuration You can pass configuration using the `ConfigParser`. To see all available options, do `marker_single --help`. ```python from marker.converters.pdf import PdfConverter from marker.models import create_model_dict from marker.config.parser import ConfigParser config = { "output_format": "json", "ADDITIONAL_KEY": "VALUE" } config_parser = ConfigParser(config) converter = PdfConverter( config=config_parser.generate_config_dict(), artifact_dict=create_model_dict(), processor_list=config_parser.get_processors(), renderer=config_parser.get_renderer(), llm_service=config_parser.get_llm_service() ) rendered = converter("FILEPATH") ``` ### Extract blocks Each document consists of one or more pages. Pages contain blocks, which can themselves contain other blocks. It's possible to programmatically manipulate these blocks. Here's an example of extracting all forms from a document: ```python from marker.converters.pdf import PdfConverter from marker.models import create_model_dict from marker.schema import BlockTypes converter = PdfConverter( artifact_dict=create_model_dict(), ) document = converter.build_document("FILEPATH") forms = document.contained_blocks((BlockTypes.Form,)) ``` Look at the processors for more examples of extracting and manipulating blocks. ## Other converters You can also use other converters that define different conversion pipelines: ### Extract tables The `TableConverter` will only convert and extract tables: ```python from marker.converters.table import TableConverter from marker.models import create_model_dict from marker.output import text_from_rendered converter = TableConverter( artifact_dict=create_model_dict(), ) rendered = converter("FILEPATH") text, _, images = text_from_rendered(rendered) ``` This takes all the same configuration as the PdfConverter. You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table. Set `output_format=json` to also get cell bounding boxes. You can also run this via the CLI with ```shell marker_single FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter --output_format json ``` ### OCR Only If you only want to run OCR, you can also do that through the `OCRConverter`. Set `--keep_chars` to keep individual characters and bounding boxes. ```python from marker.converters.ocr import OCRConverter from marker.models import create_model_dict converter = OCRConverter( artifact_dict=create_model_dict(), ) rendered = converter("FILEPATH") ``` This takes all the same configuration as the PdfConverter. You can also run this via the CLI with ```shell marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter ``` ### Structured Extraction (beta) You can run structured extraction via the `ExtractionConverter`. This requires an llm service to be setup first (see [here](#llm-services) for details). You'll get a JSON output with the extracted values. 
```python from marker.converters.extraction import ExtractionConverter from marker.models import create_model_dict from marker.config.parser import ConfigParser from pydantic import BaseModel class Links(BaseModel): links: list[str] schema = Links.model_json_schema() config_parser = ConfigParser({ "page_schema": schema }) converter = ExtractionConverter( artifact_dict=create_model_dict(), config=config_parser.generate_config_dict(), llm_service=config_parser.get_llm_service(), ) rendered = converter("FILEPATH") ``` Rendered will have an `original_markdown` field. If you pass this back in next time you run the converter, as the `existing_markdown` config key, you can skip re-parsing the document. # Output Formats ## Markdown Markdown output will include: - image links (images will be saved in the same folder) - formatted tables - embedded LaTeX equations (fenced with `$$`) - Code is fenced with triple backticks - Superscripts for footnotes ## HTML HTML output is similar to markdown output: - Images are included via `img` tags - equations are fenced with `<math>` tags - code is in `pre` tags ## JSON JSON output will be organized in a tree-like structure, with the leaf nodes being blocks. Examples of leaf nodes are a single list item, a paragraph of text, or an image. The output will be a list, with each list item representing a page. Each page is considered a block in the internal marker schema. There are different types of blocks to represent different elements. Pages have the keys: - `id` - unique id for the block. - `block_type` - the type of block. The possible block types can be seen in `marker/schema/__init__.py`. As of this writing, they are ["Line", "Span", "FigureGroup", "TableGroup", "ListGroup", "PictureGroup", "Page", "Caption", "Code", "Figure", "Footnote", "Form", "Equation", "Handwriting", "TextInlineMath", "ListItem", "PageFooter", "PageHeader", "Picture", "SectionHeader", "Table", "Text", "TableOfContents", "Document"] - `html` - the HTML for the page. Note that this will have recursive references to children. The `content-ref` tags must be replaced with the child content if you want the full html. You can see an example of this at `marker/output.py:json_to_html`. That function will take in a single block from the json output, and turn it into HTML. - `polygon` - the 4-corner polygon of the page, in (x1,y1), (x2,y2), (x3, y3), (x4, y4) format. (x1,y1) is the top left, and coordinates go clockwise. - `children` - the child blocks. The child blocks have two additional keys: - `section_hierarchy` - indicates the sections that the block is part of. `1` indicates an h1 tag, `2` an h2, and so on. - `images` - base64 encoded images. The key will be the block id, and the data will be the encoded image. Note that child blocks of pages can have their own children as well (a tree structure). 
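To get fully expanded HTML for a block in the JSON output, each `content-ref` tag has to be replaced with the rendered HTML of the child it points to. The canonical implementation of this is `marker/output.py:json_to_html`; the snippet below is only a rough sketch of the same idea, assuming the JSON has been loaded into plain Python dicts (the function name is illustrative):

```python
import re


def render_full_html(block: dict) -> str:
    # Index this block's children by their id so content-refs can be resolved
    children = {child["id"]: child for child in (block.get("children") or [])}

    def resolve(match: re.Match) -> str:
        child = children.get(match.group(1))
        return render_full_html(child) if child else ""

    # Tags look like <content-ref src='/page/10/SectionHeader/0'></content-ref>
    return re.sub(
        r"<content-ref src='([^']+)'></content-ref>",
        resolve,
        block.get("html") or "",
    )
```

For reference, here is an example page block from the JSON output: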
```json { "id": "/page/10/Page/366", "block_type": "Page", "html": "<content-ref src='/page/10/SectionHeader/0'></content-ref><content-ref src='/page/10/SectionHeader/1'></content-ref><content-ref src='/page/10/Text/2'></content-ref><content-ref src='/page/10/Text/3'></content-ref><content-ref src='/page/10/Figure/4'></content-ref><content-ref src='/page/10/SectionHeader/5'></content-ref><content-ref src='/page/10/SectionHeader/6'></content-ref><content-ref src='/page/10/TextInlineMath/7'></content-ref><content-ref src='/page/10/TextInlineMath/8'></content-ref><content-ref src='/page/10/Table/9'></content-ref><content-ref src='/page/10/SectionHeader/10'></content-ref><content-ref src='/page/10/Text/11'></content-ref>", "polygon": [[0.0, 0.0], [612.0, 0.0], [612.0, 792.0], [0.0, 792.0]], "children": [ { "id": "/page/10/SectionHeader/0", "block_type": "SectionHeader", "html": "<h1>Supplementary Material for <i>Subspace Adversarial Training</i> </h1>", "polygon": [ [217.845703125, 80.630859375], [374.73046875, 80.630859375], [374.73046875, 107.0], [217.845703125, 107.0] ], "children": null, "section_hierarchy": { "1": "/page/10/SectionHeader/1" }, "images": {} }, ... ] } ``` ## Chunks Chunks format is similar to JSON, but flattens everything into a single list instead of a tree. Only the top level blocks from each page show up. It also has the full HTML of each block inside, so you don't need to crawl the tree to reconstruct it. This enable flexible and easy chunking for RAG. ## Metadata All output formats will return a metadata dictionary, with the following fields: ```json { "table_of_contents": [ { "title": "Introduction", "heading_level": 1, "page_id": 0, "polygon": [...] } ], // computed PDF table of contents "page_stats": [ { "page_id": 0, "text_extraction_method": "pdftext", "block_counts": [("Span", 200), ...] }, ... ] } ``` # LLM Services When running with the `--use_llm` flag, you have a choice of services you can use: - `Gemini` - this will use the Gemini developer API by default. You'll need to pass `--gemini_api_key` to configuration. - `Google Vertex` - this will use vertex, which can be more reliable. You'll need to pass `--vertex_project_id`. To use it, set `--llm_service=marker.services.vertex.GoogleVertexService`. - `Ollama` - this will use local models. You can configure `--ollama_base_url` and `--ollama_model`. To use it, set `--llm_service=marker.services.ollama.OllamaService`. - `Claude` - this will use the anthropic API. You can configure `--claude_api_key`, and `--claude_model_name`. To use it, set `--llm_service=marker.services.claude.ClaudeService`. - `OpenAI` - this supports any openai-like endpoint. You can configure `--openai_api_key`, `--openai_model`, and `--openai_base_url`. To use it, set `--llm_service=marker.services.openai.OpenAIService`. - `Azure OpenAI` - this uses the Azure OpenAI service. You can configure `--azure_endpoint`, `--azure_api_key`, and `--deployment_name`. To use it, set `--llm_service=marker.services.azure_openai.AzureOpenAIService`. These services may have additional optional configuration as well - you can see it by viewing the classes. # Internals Marker is easy to extend. The core units of marker are: - `Providers`, at `marker/providers`. These provide information from a source file, like a PDF. - `Builders`, at `marker/builders`. These generate the initial document blocks and fill in text, using info from the providers. - `Processors`, at `marker/processors`. 
These process specific blocks, for example the table formatter is a processor. - `Renderers`, at `marker/renderers`. These use the blocks to render output. - `Schema`, at `marker/schema`. The classes for all the block types. - `Converters`, at `marker/converters`. They run the whole end to end pipeline. To customize processing behavior, override the `processors`. To add new output formats, write a new `renderer`. For additional input formats, write a new `provider.` Processors and renderers can be directly passed into the base `PDFConverter`, so you can specify your own custom processing easily. ## API server There is a very simple API server you can run like this: ```shell pip install -U uvicorn fastapi python-multipart marker_server --port 8001 ``` This will start a fastapi server that you can access at `localhost:8001`. You can go to `localhost:8001/docs` to see the endpoint options. You can send requests like this: ``` import requests import json post_data = { 'filepath': 'FILEPATH', # Add other params here } requests.post("http://localhost:8001/marker", data=json.dumps(post_data)).json() ``` Note that this is not a very robust API, and is only intended for small-scale use. If you want to use this server, but want a more robust conversion option, you can use the hosted [Datalab API](https://www.datalab.to/plans). # Troubleshooting There are some settings that you may find useful if things aren't working the way you expect: - If you have issues with accuracy, try setting `--use_llm` to use an LLM to improve quality. You must set `GOOGLE_API_KEY` to a Gemini API key for this to work. - Make sure to set `force_ocr` if you see garbled text - this will re-OCR the document. - `TORCH_DEVICE` - set this to force marker to use a given torch device for inference. - If you're getting out of memory errors, decrease worker count. You can also try splitting up long PDFs into multiple files. ## Debugging Pass the `debug` option to activate debug mode. This will save images of each page with detected layout and text, as well as output a json file with additional bounding box information. # Benchmarks ## Overall PDF Conversion We created a [benchmark set](https://huggingface.co/datasets/datalab-to/marker_benchmark) by extracting single PDF pages from common crawl. We scored based on a heuristic that aligns text with ground truth text segments, and an LLM as a judge scoring method. | Method | Avg Time | Heuristic Score | LLM Score | |------------|----------|-----------------|-----------| | marker | 2.83837 | 95.6709 | 4.23916 | | llamaparse | 23.348 | 84.2442 | 3.97619 | | mathpix | 6.36223 | 86.4281 | 4.15626 | | docling | 3.69949 | 86.7073 | 3.70429 | Benchmarks were run on an H100 for markjer and docling - llamaparse and mathpix used their cloud services. 
We can also look at it by document type: <img src="data/images/per_doc.png" width="1000px"/> | Document Type | Marker heuristic | Marker LLM | Llamaparse Heuristic | Llamaparse LLM | Mathpix Heuristic | Mathpix LLM | Docling Heuristic | Docling LLM | |----------------------|------------------|------------|----------------------|----------------|-------------------|-------------|-------------------|-------------| | Scientific paper | 96.6737 | 4.34899 | 87.1651 | 3.96421 | 91.2267 | 4.46861 | 92.135 | 3.72422 | | Book page | 97.1846 | 4.16168 | 90.9532 | 4.07186 | 93.8886 | 4.35329 | 90.0556 | 3.64671 | | Other | 95.1632 | 4.25076 | 81.1385 | 4.01835 | 79.6231 | 4.00306 | 83.8223 | 3.76147 | | Form | 88.0147 | 3.84663 | 66.3081 | 3.68712 | 64.7512 | 3.33129 | 68.3857 | 3.40491 | | Presentation | 95.1562 | 4.13669 | 81.2261 | 4 | 83.6737 | 3.95683 | 84.8405 | 3.86331 | | Financial document | 95.3697 | 4.39106 | 82.5812 | 4.16111 | 81.3115 | 4.05556 | 86.3882 | 3.8 | | Letter | 98.4021 | 4.5 | 93.4477 | 4.28125 | 96.0383 | 4.45312 | 92.0952 | 4.09375 | | Engineering document | 93.9244 | 4.04412 | 77.4854 | 3.72059 | 80.3319 | 3.88235 | 79.6807 | 3.42647 | | Legal document | 96.689 | 4.27759 | 86.9769 | 3.87584 | 91.601 | 4.20805 | 87.8383 | 3.65552 | | Newspaper page | 98.8733 | 4.25806 | 84.7492 | 3.90323 | 96.9963 | 4.45161 | 92.6496 | 3.51613 | | Magazine page | 98.2145 | 4.38776 | 87.2902 | 3.97959 | 93.5934 | 4.16327 | 93.0892 | 4.02041 | ## Throughput We benchmarked throughput using a [single long PDF](https://www.greenteapress.com/thinkpython/thinkpython.pdf). | Method | Time per page | Time per document | VRAM used | |---------|---------------|-------------------|---------- | | marker | 0.18 | 43.42 | 3.17GB | The projected throughput is 122 pages per second on an H100 - we can run 22 individual processes given the VRAM used. ## Table Conversion Marker can extract tables from PDFs using `marker.converters.table.TableConverter`. The table extraction performance is measured by comparing the extracted HTML representation of tables against the original HTML representations using the test split of [FinTabNet](https://developer.ibm.com/exchanges/data/all/fintabnet/). The HTML representations are compared using a tree edit distance based metric to judge both structure and content. Marker detects and identifies the structure of all tables in a PDF page and achieves these scores: | Method | Avg score | Total tables | |------------------|-----------|--------------| | marker | 0.816 | 99 | | marker w/use_llm | 0.907 | 99 | | gemini | 0.829 | 99 | The `--use_llm` flag can significantly improve table recognition performance, as you can see. We filter out tables that we cannot align with the ground truth, since fintabnet and our layout model have slightly different detection methods (this results in some tables being split/merged). ## Running your own benchmarks You can benchmark the performance of marker on your machine. Install marker manually with: ```shell git clone https://github.com/VikParuchuri/marker.git poetry install ``` ### Overall PDF Conversion Download the benchmark data [here](https://drive.google.com/file/d/1ZSeWDo2g1y0BRLT7KnbmytV2bjWARWba/view?usp=sharing) and unzip. Then run the overall benchmark like this: ```shell python benchmarks/overall.py --methods marker --scores heuristic,llm ``` Options: - `--use_llm` use an llm to improve the marker results. - `--max_rows` how many rows to process for the benchmark. - `--methods` can be `llamaparse`, `mathpix`, `docling`, `marker`. 
Comma separated.
- `--scores` which scoring functions to use, can be `llm`, `heuristic`. Comma separated.

### Table Conversion

The processed FinTabNet dataset is hosted [here](https://huggingface.co/datasets/datalab-to/fintabnet-test) and is automatically downloaded. Run the benchmark with:

```shell
python benchmarks/table/table.py --max_rows 100
```

Options:

- `--use_llm` uses an llm with marker to improve accuracy.
- `--use_gemini` also benchmarks gemini 2.0 flash.

# How it works

Marker is a pipeline of deep learning models:

- Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya))
- Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya))
- Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify), [surya](https://github.com/VikParuchuri/surya))
- Optionally use an LLM to improve quality
- Combine blocks and postprocess complete text

It only uses models where necessary, which improves speed and accuracy.

# Limitations

PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address:

- Very complex layouts, with nested tables and forms, may not work
- Forms may not be rendered well

Note: Passing the `--use_llm` and `--force_ocr` flags will mostly solve these issues.

# Usage and Deployment Examples

You can always run `marker` locally, but if you want to expose it as an API, we have a few options:

- Our platform API, which is powered by `marker` and `surya` and is easy to test out - it's free to sign up, and we'll include credits - [try it out here](https://datalab.to)
- Our painless on-prem solution for commercial use, which you can [read about here](https://www.datalab.to/blog/self-serve-on-prem-licensing) - it gives you privacy guarantees with high-throughput inference optimizations.
- A [deployment example with Modal](./examples/README.md) that shows you how to deploy and access `marker` through a web endpoint using [`Modal`](https://modal.com). Modal is an AI compute platform that enables developers to deploy and scale models on GPUs in minutes.
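As a final example that ties back to the Internals section above, here is a minimal sketch of a custom processor. It subclasses `BaseProcessor` from `marker/processors`, declares which block types it handles, and mutates matching blocks when called with a built document (the class name and the whitespace-stripping behavior are made up purely for illustration):

```python
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document


class CodeWhitespaceProcessor(BaseProcessor):
    """Illustrative processor that strips trailing whitespace from code blocks."""

    block_types = (BlockTypes.Code,)

    def __call__(self, document: Document, *args, **kwargs):
        # Walk every Code block in the document and normalize its text
        for block in document.contained_blocks(self.block_types):
            if block.code:
                block.code = block.code.rstrip()
```

To use something like this, you could reference its full module path via the `--processors` override, or include it in the `processor_list` passed to `PdfConverter` as shown in the custom configuration example above (check the converter signature for the exact form it expects).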
``` -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- ```python ``` -------------------------------------------------------------------------------- /benchmarks/overall/__init__.py: -------------------------------------------------------------------------------- ```python ``` -------------------------------------------------------------------------------- /benchmarks/overall/display/__init__.py: -------------------------------------------------------------------------------- ```python ``` -------------------------------------------------------------------------------- /benchmarks/overall/download/__init__.py: -------------------------------------------------------------------------------- ```python ``` -------------------------------------------------------------------------------- /benchmarks/table/__init__.py: -------------------------------------------------------------------------------- ```python ``` -------------------------------------------------------------------------------- /benchmarks/throughput/__init__.py: -------------------------------------------------------------------------------- ```python ``` -------------------------------------------------------------------------------- /marker/config/__init__.py: -------------------------------------------------------------------------------- ```python ``` -------------------------------------------------------------------------------- /marker/scripts/__init__.py: -------------------------------------------------------------------------------- ```python ``` -------------------------------------------------------------------------------- /marker/utils/__init__.py: -------------------------------------------------------------------------------- ```python ``` -------------------------------------------------------------------------------- /marker/schema/groups/base.py: -------------------------------------------------------------------------------- ```python from marker.schema.blocks import Block class Group(Block): pass ``` -------------------------------------------------------------------------------- /marker/schema/text/__init__.py: -------------------------------------------------------------------------------- ```python from marker.schema.text.line import Line from marker.schema.text.span import Span ``` -------------------------------------------------------------------------------- /marker_server.py: -------------------------------------------------------------------------------- ```python from marker.scripts.server import server_cli if __name__ == "__main__": server_cli() ``` -------------------------------------------------------------------------------- /convert.py: -------------------------------------------------------------------------------- ```python from marker.scripts.convert import convert_cli if __name__ == "__main__": convert_cli() ``` -------------------------------------------------------------------------------- /chunk_convert.py: -------------------------------------------------------------------------------- ```python from marker.scripts.chunk_convert import chunk_convert_cli if __name__ == "__main__": chunk_convert_cli() ``` -------------------------------------------------------------------------------- /convert_single.py: -------------------------------------------------------------------------------- ```python from marker.scripts.convert_single import convert_single_cli 
if __name__ == "__main__": convert_single_cli() ``` -------------------------------------------------------------------------------- /marker_app.py: -------------------------------------------------------------------------------- ```python from marker.scripts.run_streamlit_app import streamlit_app_cli if __name__ == "__main__": streamlit_app_cli() ``` -------------------------------------------------------------------------------- /extraction_app.py: -------------------------------------------------------------------------------- ```python from marker.scripts.run_streamlit_app import extraction_app_cli if __name__ == "__main__": extraction_app_cli() ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/schema.py: -------------------------------------------------------------------------------- ```python from typing import TypedDict, List class BenchmarkResult(TypedDict): markdown: str | List[str] time: float | None ``` -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- ``` [pytest] testpaths=tests markers = filename(name): specify the filename for the pdf_document fixture filterwarnings = ignore::Warning ``` -------------------------------------------------------------------------------- /benchmarks/overall/scorers/schema.py: -------------------------------------------------------------------------------- ```python from typing import TypedDict, List, Optional, Dict class BlockScores(TypedDict): score: float specific_scores: Dict[str, float | List[float]] ``` -------------------------------------------------------------------------------- /marker/schema/blocks/toc.py: -------------------------------------------------------------------------------- ```python from marker.schema import BlockTypes from marker.schema.blocks.basetable import BaseTable class TableOfContents(BaseTable): block_type: str = BlockTypes.TableOfContents block_description: str = "A table of contents." ``` -------------------------------------------------------------------------------- /marker/schema/text/char.py: -------------------------------------------------------------------------------- ```python from marker.schema import BlockTypes from marker.schema.blocks import Block class Char(Block): block_type: BlockTypes = BlockTypes.Char block_description: str = "A single character inside a span." text: str idx: int ``` -------------------------------------------------------------------------------- /marker/providers/utils.py: -------------------------------------------------------------------------------- ```python def alphanum_ratio(text): text = text.replace(" ", "") text = text.replace("\n", "") alphanumeric_count = sum([1 for c in text if c.isalnum()]) if len(text) == 0: return 1 ratio = alphanumeric_count / len(text) return ratio ``` -------------------------------------------------------------------------------- /marker/schema/blocks/table.py: -------------------------------------------------------------------------------- ```python from marker.schema import BlockTypes from marker.schema.blocks.basetable import BaseTable class Table(BaseTable): block_type: BlockTypes = BlockTypes.Table block_description: str = "A table of data, like a results table. It will be in a tabular format." 
``` -------------------------------------------------------------------------------- /benchmarks/overall/scorers/__init__.py: -------------------------------------------------------------------------------- ```python from typing import List from benchmarks.overall.scorers.schema import BlockScores class BaseScorer: def __init__(self): pass def __call__(self, sample, gt_markdown: List[str], method_markdown: str) -> BlockScores: raise NotImplementedError() ``` -------------------------------------------------------------------------------- /tests/builders/test_structure.py: -------------------------------------------------------------------------------- ```python import pytest from marker.builders.structure import StructureBuilder @pytest.mark.config({"page_range": [0]}) def test_structure_builder(pdf_document): structure = StructureBuilder() structure(pdf_document) assert len(pdf_document.pages[0].structure) > 0 ``` -------------------------------------------------------------------------------- /marker/schema/groups/__init__.py: -------------------------------------------------------------------------------- ```python from marker.schema.blocks.base import Block from marker.schema.groups.figure import FigureGroup from marker.schema.groups.table import TableGroup from marker.schema.groups.list import ListGroup from marker.schema.groups.picture import PictureGroup from marker.schema.groups.page import PageGroup ``` -------------------------------------------------------------------------------- /marker/builders/__init__.py: -------------------------------------------------------------------------------- ```python from typing import Optional from pydantic import BaseModel from marker.util import assign_config class BaseBuilder: def __init__(self, config: Optional[BaseModel | dict] = None): assign_config(self, config) def __call__(self, data, *args, **kwargs): raise NotImplementedError ``` -------------------------------------------------------------------------------- /tests/builders/test_ocr_builder.py: -------------------------------------------------------------------------------- ```python from PIL import Image from marker.builders.ocr import OcrBuilder def test_blank_char_builder(recognition_model): builder = OcrBuilder(recognition_model) image = Image.new("RGB", (100, 100)) spans = builder.spans_from_html_chars([], None, image) # Test with empty char list assert len(spans) == 0 ``` -------------------------------------------------------------------------------- /marker/schema/blocks/form.py: -------------------------------------------------------------------------------- ```python from typing import List from marker.schema import BlockTypes from marker.schema.blocks.basetable import BaseTable class Form(BaseTable): block_type: BlockTypes = BlockTypes.Form block_description: str = "A form, such as a tax form, that contains fields and labels. It most likely doesn't have a table structure." 
``` -------------------------------------------------------------------------------- /tests/renderers/test_json_renderer.py: -------------------------------------------------------------------------------- ```python import pytest from marker.renderers.json import JSONRenderer @pytest.mark.config({"page_range": [0]}) def test_markdown_renderer_pagination(pdf_document): renderer = JSONRenderer() pages = renderer(pdf_document).children assert len(pages) == 1 assert pages[0].block_type == "Page" assert pages[0].children[0].block_type == "SectionHeader" ``` -------------------------------------------------------------------------------- /benchmarks/overall/schema.py: -------------------------------------------------------------------------------- ```python from typing import TypedDict, List, Dict from benchmarks.overall.scorers.schema import BlockScores AVG_TYPE = Dict[str, Dict[str, Dict[str, List[float]]]] class FullResult(TypedDict): scores: Dict[int, Dict[str, Dict[str, BlockScores]]] averages_by_type: AVG_TYPE averages_by_block_type: AVG_TYPE average_times: Dict[str, List[float]] markdown: Dict[int, Dict[str, str]] ``` -------------------------------------------------------------------------------- /tests/builders/test_strip_existing_ocr.py: -------------------------------------------------------------------------------- ```python import pytest @pytest.mark.config({"page_range": [0], "strip_existing_ocr": True}) @pytest.mark.filename("handwritten.pdf") def test_strip_ocr(doc_provider): # Ensure that the OCR text isn't extracted assert len(doc_provider.page_lines) == 0 @pytest.mark.config({"page_range": [0]}) @pytest.mark.filename("handwritten.pdf") def test_keep_ocr(doc_provider): assert len(doc_provider.page_lines) == 1 ``` -------------------------------------------------------------------------------- /tests/renderers/test_html_renderer.py: -------------------------------------------------------------------------------- ```python import pytest from marker.renderers.html import HTMLRenderer @pytest.mark.config( { "page_range": [0], "disable_ocr": True, "add_block_ids": True, "paginate_output": True, } ) def test_html_renderer_block_ids(pdf_document, config): renderer = HTMLRenderer(config) html = renderer(pdf_document).html # Verify some block IDs are present assert "/page/0/Text/1" in html ``` -------------------------------------------------------------------------------- /tests/processors/test_document_toc_processor.py: -------------------------------------------------------------------------------- ```python import pytest from marker.processors.document_toc import DocumentTOCProcessor @pytest.mark.config({"page_range": [0]}) def test_document_toc_processor(pdf_document, detection_model, recognition_model, table_rec_model): processor = DocumentTOCProcessor() processor(pdf_document) assert len(pdf_document.table_of_contents) == 4 assert pdf_document.table_of_contents[0]["title"] == "Subspace Adversarial Training" ``` -------------------------------------------------------------------------------- /tests/processors/test_equation_processor.py: -------------------------------------------------------------------------------- ```python import pytest from marker.schema import BlockTypes from marker.processors.equation import EquationProcessor @pytest.mark.config({"page_range": [0]}) def test_equation_processor(pdf_document, recognition_model): processor = EquationProcessor(recognition_model) processor(pdf_document) for block in pdf_document.pages[0].children: if block.block_type == 
BlockTypes.Equation: assert block.html is not None ``` -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- ```python from marker.providers.pdf import PdfProvider import tempfile import datasets def setup_pdf_provider( filename='adversarial.pdf', config=None, ) -> PdfProvider: dataset = datasets.load_dataset("datalab-to/pdfs", split="train") idx = dataset['filename'].index(filename) temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf") temp_pdf.write(dataset['pdf'][idx]) temp_pdf.flush() provider = PdfProvider(temp_pdf.name, config) return provider ``` -------------------------------------------------------------------------------- /marker/schema/blocks/code.py: -------------------------------------------------------------------------------- ```python import html from marker.schema import BlockTypes from marker.schema.blocks import Block class Code(Block): block_type: BlockTypes = BlockTypes.Code code: str | None = None html: str | None = None block_description: str = "A programming code block." def assemble_html(self, document, child_blocks, parent_structure, block_config): if self.html: return self.html code = self.code or "" return f"<pre>{html.escape(code)}</pre>" ``` -------------------------------------------------------------------------------- /marker/processors/__init__.py: -------------------------------------------------------------------------------- ```python from typing import Optional, Tuple from pydantic import BaseModel from marker.schema import BlockTypes from marker.schema.document import Document from marker.util import assign_config class BaseProcessor: block_types: Tuple[BlockTypes] | None = None # What block types this processor is responsible for def __init__(self, config: Optional[BaseModel | dict] = None): assign_config(self, config) def __call__(self, document: Document, *args, **kwargs): raise NotImplementedError ``` -------------------------------------------------------------------------------- /tests/processors/test_footnote_processor.py: -------------------------------------------------------------------------------- ```python import pytest from marker.processors.footnote import FootnoteProcessor from marker.schema import BlockTypes @pytest.mark.filename("population_stats.pdf") @pytest.mark.config({"page_range": [4]}) def test_footnote_processor(pdf_document): processor = FootnoteProcessor() processor(pdf_document) page0_footnotes = pdf_document.pages[0].contained_blocks(pdf_document, [BlockTypes.Footnote]) assert len(page0_footnotes) >= 2 assert page0_footnotes[-1].raw_text(pdf_document).strip().startswith("5") ``` -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- ```markdown --- name: Feature request about: Suggest an idea for this project title: "[FEAT]" labels: enhancement assignees: '' --- ## ✨ Is your feature request related to a problem? A clear and concise description of what the problem is. ## 💡 Describe the Solution You'd Like A concise description of what you want to happen or how you envision it working. ## 📋 Alternatives Considered Any alternative solutions or workarounds you've tried. ## 🧩 Additional Context Any additional context, references, or related issues. 
```

--------------------------------------------------------------------------------
/marker/schema/blocks/reference.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Reference(Block):
    block_type: BlockTypes = BlockTypes.Reference
    ref: str

    block_description: str = "A reference to this block from another block."

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        template = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
        return f"<span id='{self.ref}'>{template}</span>"
```

--------------------------------------------------------------------------------
/marker/utils/batch.py:
--------------------------------------------------------------------------------

```python
from marker.utils.gpu import GPUManager


def get_batch_sizes_worker_counts(gpu_manager: GPUManager, peak_worker_vram: int):
    vram = gpu_manager.get_gpu_vram()
    workers = max(1, vram // peak_worker_vram)
    if workers == 1:
        return {}, workers

    return {
        "layout_batch_size": 12,
        "detection_batch_size": 8,
        "table_rec_batch_size": 12,
        "ocr_error_batch_size": 12,
        "recognition_batch_size": 64,
        "equation_batch_size": 16,
        "detector_postprocessing_cpu_workers": 2,
    }, workers
```

--------------------------------------------------------------------------------
/tests/providers/test_pdf_provider.py:
--------------------------------------------------------------------------------

```python
import pytest


@pytest.mark.config({"page_range": [0]})
def test_pdf_provider(doc_provider):
    assert len(doc_provider) == 12
    assert doc_provider.get_images([0], 72)[0].size == (612, 792)
    assert doc_provider.get_images([0], 96)[0].size == (816, 1056)

    page_lines = doc_provider.get_page_lines(0)
    assert len(page_lines) == 85

    spans = page_lines[0].spans
    assert len(spans) == 2
    assert spans[0].text == "Subspace Adversarial Training"
    assert spans[0].font == "NimbusRomNo9L-Medi"
    assert spans[0].formats == ["plain"]
```

--------------------------------------------------------------------------------
/tests/providers/test_image_provider.py:
--------------------------------------------------------------------------------

```python
from marker.providers.image import ImageProvider
from marker.renderers.markdown import MarkdownOutput


def test_image_provider(config, temp_image):
    provider = ImageProvider(temp_image.name, config)
    assert len(provider) == 1
    assert provider.get_images([0], 72)[0].size == (512, 512)

    page_lines = provider.get_page_lines(0)
    assert len(page_lines) == 0


def test_image_provider_conversion(pdf_converter, temp_image):
    markdown_output: MarkdownOutput = pdf_converter(temp_image.name)
    assert "Hello, World!" in markdown_output.markdown
```

--------------------------------------------------------------------------------
/marker/schema/groups/picture.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.groups.base import Group


class PictureGroup(Group):
    block_type: BlockTypes = BlockTypes.PictureGroup
    block_description: str = "A picture along with associated captions."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.html:
            return self.html

        child_html = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
        return child_html
```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/mathpix.py:
--------------------------------------------------------------------------------

```python
import datasets

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


class MathpixMethod(BaseMethod):
    mathpix_ds: datasets.Dataset = None

    def __call__(self, sample) -> BenchmarkResult:
        uuid = sample["uuid"]
        data = None
        for row in self.mathpix_ds:
            if str(row["uuid"]) == str(uuid):
                data = row
                break
        if not data:
            raise ValueError(f"Could not find data for uuid {uuid}")

        return {
            "markdown": data["md"],
            "time": data["time"]
        }
```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/mistral.py:
--------------------------------------------------------------------------------

```python
import datasets

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


class MistralMethod(BaseMethod):
    mistral_ds: datasets.Dataset = None

    def __call__(self, sample) -> BenchmarkResult:
        uuid = sample["uuid"]
        data = None
        for row in self.mistral_ds:
            if str(row["uuid"]) == str(uuid):
                data = row
                break
        if not data:
            raise ValueError(f"Could not find data for uuid {uuid}")

        return {
            "markdown": data["md"],
            "time": data["time"]
        }
```

--------------------------------------------------------------------------------
/marker/scripts/run_streamlit_app.py:
--------------------------------------------------------------------------------

```python
import subprocess
import os
import sys


def streamlit_app_cli(app_name: str = "streamlit_app.py"):
    argv = sys.argv[1:]
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    app_path = os.path.join(cur_dir, app_name)
    cmd = [
        "streamlit",
        "run",
        app_path,
        "--server.fileWatcherType",
        "none",
        "--server.headless",
        "true",
    ]
    if argv:
        cmd += ["--"] + argv
    subprocess.run(cmd, env={**os.environ, "IN_STREAMLIT": "true"})


def extraction_app_cli():
    streamlit_app_cli("extraction_app.py")
```

--------------------------------------------------------------------------------
/marker/schema/groups/figure.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.groups.base import Group


class FigureGroup(Group):
    block_type: BlockTypes = BlockTypes.FigureGroup
    block_description: str = "A group that contains a figure and associated captions."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.html:
            return self.html

        child_html = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
        return child_html
```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/llamaparse.py:
--------------------------------------------------------------------------------

```python
import datasets

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


class LlamaParseMethod(BaseMethod):
    llamaparse_ds: datasets.Dataset = None

    def __call__(self, sample) -> BenchmarkResult:
        uuid = sample["uuid"]
        data = None
        for row in self.llamaparse_ds:
            if str(row["uuid"]) == str(uuid):
                data = row
                break
        if not data:
            raise ValueError(f"Could not find data for uuid {uuid}")

        return {
            "markdown": data["md"],
            "time": data["time"]
        }
```

--------------------------------------------------------------------------------
/marker/schema/blocks/handwriting.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Handwriting(Block):
    block_type: BlockTypes = BlockTypes.Handwriting
    block_description: str = "A region that contains handwriting."
    html: str | None = None
    replace_output_newlines: bool = True

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.html:
            return self.html
        else:
            return super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
```

--------------------------------------------------------------------------------
/tests/schema/groups/test_list_grouping.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.builders.structure import StructureBuilder
from marker.schema import BlockTypes


@pytest.mark.config({"page_range": [4]})
@pytest.mark.skip(reason="Model breaks this up due to equations")
def test_list_grouping(pdf_document):
    structure = StructureBuilder()
    structure(pdf_document)

    page = pdf_document.pages[0]
    list_groups = []
    for block in page.children:
        if block.block_type == BlockTypes.ListGroup:
            list_groups.append(block)

    # The model breaks this up, since it has equations in it
    assert len(list_groups) == 3
```

--------------------------------------------------------------------------------
/tests/renderers/test_extract_images.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.renderers.markdown import MarkdownRenderer


@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("A17_FlightPlan.pdf")
def test_disable_extract_images(pdf_document):
    renderer = MarkdownRenderer({"extract_images": False})
    md = renderer(pdf_document).markdown

    # Verify markdown
    assert "jpeg" not in md


@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("A17_FlightPlan.pdf")
def test_extract_images(pdf_document):
    renderer = MarkdownRenderer()
    md = renderer(pdf_document).markdown

    # Verify markdown
    assert "jpeg" in md
```

--------------------------------------------------------------------------------
/marker/renderers/extraction.py:
--------------------------------------------------------------------------------

```python
from pydantic import BaseModel

from marker.extractors.document import DocumentExtractionSchema
from marker.renderers import BaseRenderer


class ExtractionOutput(BaseModel):
    analysis: str
    document_json: str
    original_markdown: str


class ExtractionRenderer(BaseRenderer):
    def __call__(
        self, output: DocumentExtractionSchema, markdown: str
    ) -> ExtractionOutput:
        # We definitely want to do more complex stuff here soon, so leave it in
        return ExtractionOutput(
            analysis=output.analysis,
            document_json=output.document_json,
            original_markdown=markdown,
        )
```

--------------------------------------------------------------------------------
/marker/scripts/chunk_convert.py:
--------------------------------------------------------------------------------

```python
import argparse
import os
import subprocess
import pkg_resources


def chunk_convert_cli():
    parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
    parser.add_argument("in_folder", help="Input folder with pdfs.")
    parser.add_argument("out_folder", help="Output folder")
    args = parser.parse_args()

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    script_path = os.path.join(cur_dir, "chunk_convert.sh")

    # Construct the command
    cmd = f"{script_path} {args.in_folder} {args.out_folder}"

    # Execute the shell script
    subprocess.run(cmd, shell=True, check=True)
```

--------------------------------------------------------------------------------
/tests/processors/test_ignoretext.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.processors.ignoretext import IgnoreTextProcessor
from marker.schema import BlockTypes


@pytest.mark.filename("bio_pdf.pdf")
@pytest.mark.config({"page_range": list(range(10))})
@pytest.mark.skip(reason="New layout model correctly identifies the block as a PageHeader, so nothing to be done by the IgnoreTextProcessor")
def test_ignoretext_processor(pdf_document):
    processor = IgnoreTextProcessor()
    processor(pdf_document)

    page1_header = pdf_document.pages[1].contained_blocks(pdf_document, [BlockTypes.Text])[0]
    assert "bioRxiv" in page1_header.raw_text(pdf_document)

    assert page1_header.ignore_for_output is True
```

--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------

```yaml
name: CI tests

on: [push]

jobs:
  tests:
    runs-on: t4_gpu
    steps:
      - uses: actions/checkout@v3
      - name: Install apt requirements
        run: |
          sudo apt-get update
          sudo apt-get install -y libpango-1.0-0 libharfbuzz0b libpangoft2-1.0-0 libgdk-pixbuf2.0-0 libcairo2 libffi-dev shared-mime-info
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install --extras "full"
      - name: Run tests
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: poetry run pytest
```

--------------------------------------------------------------------------------
/marker/schema/blocks/footnote.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Footnote(Block):
    block_type: BlockTypes = BlockTypes.Footnote
    block_description: str = (
        "A footnote that explains a term or concept in the document."
    )
    replace_output_newlines: bool = True
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        return super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/gt.py:
--------------------------------------------------------------------------------

```python
from typing import List

import json
from PIL import Image

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


class GTMethod(BaseMethod):
    def __call__(self, sample) -> BenchmarkResult:
        gt_blocks = json.loads(sample["gt_blocks"])
        gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
        gt_markdown = [self.convert_to_md(block) for block in gt_html]
        return {
            "markdown": gt_markdown,
            "time": 0
        }

    def render(self, html: List[str]) -> Image.Image:
        joined = "\n\n".join(html)
        html = f"""
<html>
<head></head>
<body>
{joined}
</body>
</html>
""".strip()
        return self.html_to_image(html)
```

--------------------------------------------------------------------------------
/marker/schema/blocks/caption.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Caption(Block):
    block_type: BlockTypes = BlockTypes.Caption
    block_description: str = "A text caption that is directly above or below an image or table. Only used for text describing the image or table."
    replace_output_newlines: bool = True
    html: str | None = None

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        return super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
```

--------------------------------------------------------------------------------
/marker/schema/__init__.py:
--------------------------------------------------------------------------------

```python
from enum import auto, Enum


class BlockTypes(str, Enum):
    Line = auto()
    Span = auto()
    Char = auto()
    FigureGroup = auto()
    TableGroup = auto()
    ListGroup = auto()
    PictureGroup = auto()
    Page = auto()
    Caption = auto()
    Code = auto()
    Figure = auto()
    Footnote = auto()
    Form = auto()
    Equation = auto()
    Handwriting = auto()
    TextInlineMath = auto()
    ListItem = auto()
    PageFooter = auto()
    PageHeader = auto()
    Picture = auto()
    SectionHeader = auto()
    Table = auto()
    Text = auto()
    TableOfContents = auto()
    Document = auto()
    ComplexRegion = auto()
    TableCell = auto()
    Reference = auto()

    def __str__(self):
        return self.name
```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/docling.py:
--------------------------------------------------------------------------------

```python
import tempfile
import time

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


class DoclingMethod(BaseMethod):
    model_dict: dict = None
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        from docling.document_converter import DocumentConverter
        pdf_bytes = sample["pdf"]  # This is a single page PDF
        converter = DocumentConverter()

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            start = time.time()
            result = converter.convert(f.name)
            total = time.time() - start

        return {
            "markdown": result.document.export_to_markdown(),
            "time": total
        }
```

--------------------------------------------------------------------------------
/marker/schema/blocks/pagefooter.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class PageFooter(Block):
    block_type: str = BlockTypes.PageFooter
    block_description: str = (
        "Text that appears at the bottom of a page, like a page number."
    )
    replace_output_newlines: bool = True
    ignore_for_output: bool = True
    html: str | None = None

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        if block_config and block_config.get("keep_pagefooter_in_output"):
            self.ignore_for_output = False

        if self.html and not self.ignore_for_output:
            return self.html

        return super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
```

--------------------------------------------------------------------------------
/marker/schema/blocks/pageheader.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class PageHeader(Block):
    block_type: BlockTypes = BlockTypes.PageHeader
    block_description: str = (
        "Text that appears at the top of a page, like a page title."
    )
    replace_output_newlines: bool = True
    ignore_for_output: bool = True
    html: str | None = None

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        if block_config and block_config.get("keep_pageheader_in_output"):
            self.ignore_for_output = False

        if self.html and not self.ignore_for_output:
            return self.html

        return super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
```

--------------------------------------------------------------------------------
/marker/schema/groups/table.py:
--------------------------------------------------------------------------------

```python
from typing import List

from marker.schema import BlockTypes
from marker.schema.blocks import BlockOutput
from marker.schema.groups.base import Group


class TableGroup(Group):
    block_type: BlockTypes = BlockTypes.TableGroup
    block_description: str = "A table along with associated captions."
    html: str | None = None

    def assemble_html(
        self,
        document,
        child_blocks: List[BlockOutput],
        parent_structure=None,
        block_config: dict | None = None,
    ):
        if self.html:
            return self.handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        return super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
```

--------------------------------------------------------------------------------
/tests/builders/test_rotated_bboxes.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.schema import BlockTypes


@pytest.mark.config({"page_range": [0]})
@pytest.mark.filename("adversarial_rot.pdf")
def test_rotated_bboxes(pdf_document):
    first_page = pdf_document.pages[0]

    # Ensure we match all text lines up properly
    text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,))
    text_blocks = first_page.contained_blocks(
        pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
    )
    # assert len(text_lines) == 84

    # Ensure the bbox sizes match up
    max_line_position = max([line.polygon.x_end for line in text_lines])
    max_block_position = max(
        [block.polygon.x_end for block in text_blocks if block.source == "layout"]
    )
    assert max_line_position <= max_block_position
```

--------------------------------------------------------------------------------
/marker/processors/document_toc.py:
--------------------------------------------------------------------------------

```python
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document


class DocumentTOCProcessor(BaseProcessor):
    """
    A processor for generating a table of contents for the document.
    """
    block_types = (BlockTypes.SectionHeader, )

    def __call__(self, document: Document):
        toc = []
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                toc.append({
                    "title": block.raw_text(document).strip(),
                    "heading_level": block.heading_level,
                    "page_id": page.page_id,
                    "polygon": block.polygon.polygon
                })
        document.table_of_contents = toc
```

--------------------------------------------------------------------------------
/benchmarks/overall/registry.py:
--------------------------------------------------------------------------------

```python
from benchmarks.overall.methods.docling import DoclingMethod
from benchmarks.overall.methods.gt import GTMethod
from benchmarks.overall.methods.llamaparse import LlamaParseMethod
from benchmarks.overall.methods.marker import MarkerMethod
from benchmarks.overall.methods.mathpix import MathpixMethod
from benchmarks.overall.methods.mistral import MistralMethod
from benchmarks.overall.methods.olmocr import OlmOCRMethod
from benchmarks.overall.scorers.heuristic import HeuristicScorer
from benchmarks.overall.scorers.llm import LLMScorer

SCORE_REGISTRY = {
    "heuristic": HeuristicScorer,
    "llm": LLMScorer
}

METHOD_REGISTRY = {
    "marker": MarkerMethod,
    "gt": GTMethod,
    "mathpix": MathpixMethod,
    "llamaparse": LlamaParseMethod,
    "docling": DoclingMethod,
    "olmocr": OlmOCRMethod,
    "mistral": MistralMethod
}
```

--------------------------------------------------------------------------------
/marker/processors/page_header.py:
--------------------------------------------------------------------------------

```python
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup


class PageHeaderProcessor(BaseProcessor):
    """
    A processor for moving PageHeaders to the top.
    """
    block_types = (BlockTypes.PageHeader,)

    def __call__(self, document: Document):
        for page in document.pages:
            self.move_page_header_to_top(page, document)

    def move_page_header_to_top(self, page: PageGroup, document: Document):
        page_header_blocks = page.contained_blocks(document, self.block_types)
        page_header_block_ids = [block.id for block in page_header_blocks]

        for block_id in page_header_block_ids:
            page.structure.remove(block_id)

        page.structure[:0] = page_header_block_ids
```

--------------------------------------------------------------------------------
/marker/schema/groups/list.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.groups.base import Group


class ListGroup(Group):
    block_type: BlockTypes = BlockTypes.ListGroup
    has_continuation: bool = False
    block_description: str = "A group of list items that should be rendered together."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.html:
            return self.handle_html_output(
                document, child_blocks, parent_structure, block_config
            )
        template = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )

        el_attr = f" block-type='{self.block_type}'"
        if self.has_continuation:
            el_attr += " class='has-continuation'"
        return f"<p{el_attr}><ul>{template}</ul></p>"
```

--------------------------------------------------------------------------------
/benchmarks/overall/download/main.py:
--------------------------------------------------------------------------------

```python
import click

from benchmarks.overall.download.llamaparse import LlamaParseDownloader
from benchmarks.overall.download.mathpix import MathpixDownloader
from benchmarks.overall.download.mistral import MistralDownloader


@click.command("Download data from inference services")
@click.argument("service", type=click.Choice(["mathpix", "llamaparse", "mistral"]))
@click.option("--max_rows", type=int, default=2200)
@click.option("--api_key", type=str, default=None)
@click.option("--app_id", type=str, default=None)
def main(service: str, max_rows: int, api_key: str, app_id: str):
    registry = {
        "mathpix": MathpixDownloader,
        "llamaparse": LlamaParseDownloader,
        "mistral": MistralDownloader,
    }
    downloader = registry[service](api_key, app_id, max_rows=max_rows)

    # Generate data and upload to hub
    downloader()


if __name__ == "__main__":
    main()
```

--------------------------------------------------------------------------------
/marker/logger.py:
--------------------------------------------------------------------------------

```python
import logging
import warnings

from marker.settings import settings


def configure_logging():
    # Setup marker logger
    logger = get_logger()

    if not logger.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
        )
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    logger.setLevel(settings.LOGLEVEL)

    # Ignore future warnings
    warnings.simplefilter(action="ignore", category=FutureWarning)

    # Set component loglevels
    logging.getLogger("PIL").setLevel(logging.ERROR)
    logging.getLogger("fontTools.subset").setLevel(logging.ERROR)
    logging.getLogger("fontTools.ttLib.ttFont").setLevel(logging.ERROR)
    logging.getLogger("weasyprint").setLevel(logging.CRITICAL)


def get_logger():
    return logging.getLogger("marker")
```
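
For context, a minimal usage sketch of the logging helpers in `marker/logger.py` above. It assumes only the `configure_logging` and `get_logger` functions shown there; the log message itself is illustrative.

```python
# Minimal usage sketch for marker/logger.py (assumes only the functions shown above).
from marker.logger import configure_logging, get_logger

configure_logging()   # attaches a stream handler once and sets component log levels
logger = get_logger()  # the shared "marker" logger
logger.info("Starting conversion")  # illustrative message
```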

--------------------------------------------------------------------------------
/tests/processors/test_table_merge.py:
--------------------------------------------------------------------------------

```python
from unittest.mock import Mock

import pytest

from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor
from marker.processors.table import TableProcessor
from marker.schema import BlockTypes


@pytest.mark.filename("table_ex2.pdf")
def test_llm_table_processor_nomerge(pdf_document, table_rec_model, recognition_model, detection_model, mocker):
    mock_cls = Mock()
    mock_cls.return_value = {
        "merge": "true",
        "direction": "right"
    }

    cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model)
    cell_processor(pdf_document)

    tables = pdf_document.contained_blocks((BlockTypes.Table,))
    assert len(tables) == 3

    processor = LLMTableMergeProcessor(mock_cls, {"use_llm": True, "gemini_api_key": "test"})
    processor(pdf_document)
    tables = pdf_document.contained_blocks((BlockTypes.Table,))
    assert len(tables) == 3
```

--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------

```yaml
name: Python package

on:
  push:
    tags:
      - "v*.*.*"

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install --extras "full"
      - name: Build package
        run: |
          poetry build
      - name: Extract version from pyproject.toml
        id: version
        run: |
          VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['tool']['poetry']['version'])")
          echo "version=v$VERSION" >> $GITHUB_OUTPUT
      - name: Publish package
        env:
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
        run: |
          poetry config pypi-token.pypi "$PYPI_TOKEN"
          poetry publish
```

--------------------------------------------------------------------------------
/marker/schema/blocks/sectionheader.py:
--------------------------------------------------------------------------------

```python
from typing import Optional

from marker.schema import BlockTypes
from marker.schema.blocks import Block


class SectionHeader(Block):
    block_type: BlockTypes = BlockTypes.SectionHeader
    heading_level: Optional[int] = None
    block_description: str = "The header of a section of text or other blocks."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.ignore_for_output:
            return ""

        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        template = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
        template = template.replace("\n", " ")
        tag = f"h{self.heading_level}" if self.heading_level else "h2"
        return f"<{tag}>{template}</{tag}>"
```

--------------------------------------------------------------------------------
/tests/renderers/test_chunk_renderer.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.renderers.chunk import ChunkRenderer


@pytest.mark.config({"page_range": [0]})
def test_chunk_renderer(pdf_document):
    renderer = ChunkRenderer()
    chunk_output = renderer(pdf_document)
    blocks = chunk_output.blocks
    page_info = chunk_output.page_info

    assert len(blocks) == 14
    assert blocks[0].block_type == "SectionHeader"
    assert page_info[0]["bbox"] is not None
    assert page_info[0]["polygon"] is not None

    figure_groups = [block for block in blocks if block.block_type == "FigureGroup"]
    figures = [block for block in blocks if block.block_type == "Figure"]
    captions = [block for block in blocks if block.block_type == "Caption"]
    assert len(figure_groups) == 1
    assert len(figures) == 0
    assert len(captions) == 0

    figure_group = figure_groups[0]
    assert figure_group.images is not None
    assert len(figure_group.images) == 1
    assert "<img src='/page/0/Figure/9'>" in figure_group.html
```

--------------------------------------------------------------------------------
/marker/providers/html.py:
--------------------------------------------------------------------------------

```python
import os
import tempfile

from marker.providers.pdf import PdfProvider


class HTMLProvider(PdfProvider):
    def __init__(self, filepath: str, config=None):
        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        self.temp_pdf_path = temp_pdf.name
        temp_pdf.close()

        # Convert HTML to PDF
        try:
            self.convert_html_to_pdf(filepath)
        except Exception as e:
            raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}")

        # Initialize the PDF provider with the temp pdf path
        super().__init__(self.temp_pdf_path, config)

    def __del__(self):
        if os.path.exists(self.temp_pdf_path):
            os.remove(self.temp_pdf_path)

    def convert_html_to_pdf(self, filepath: str):
        from weasyprint import HTML

        font_css = self.get_font_css()
        HTML(filename=filepath, encoding="utf-8").write_pdf(
            self.temp_pdf_path, stylesheets=[font_css]
        )
```

--------------------------------------------------------------------------------
/data/latex_to_md.sh:
--------------------------------------------------------------------------------

```bash
#!/bin/bash

# List all .tex files in the latex folder
FILES=$(find latex -name "*.tex")

for f in $FILES
do
  echo "Processing $f file..."
  base_name=$(basename "$f" .tex)
  out_file="references/${base_name}.md"

  pandoc --wrap=none \
    --no-highlight \
    --strip-comments \
    --from=latex \
    --to=commonmark_x+pipe_tables \
    "$f" \
    -o "$out_file"

  # Replace non-breaking spaces
  sed -i .bak 's/ / /g' "$out_file"
  sed -i .bak 's/ / /g' "$out_file"
  sed -i .bak 's/ / /g' "$out_file"
  sed -i .bak 's/ / /g' "$out_file"
  sed -i.bak -E 's/`\\cite`//g; s/<[^>]*>//g; s/\{[^}]*\}//g; s/\\cite\{[^}]*\}//g' "$out_file"
  sed -i.bak -E '
    s/`\\cite`//g;   # Remove \cite commands inside backticks
    s/::: //g;       # Remove the leading ::: for content markers
    s/\[//g;         # Remove opening square bracket
    s/\]//g;         # Remove closing square bracket
  ' "$out_file"

  # Remove .bak file
  rm "$out_file.bak"
done
```

--------------------------------------------------------------------------------
/marker/schema/blocks/figure.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Figure(Block):
    block_type: BlockTypes = BlockTypes.Figure
    description: str | None = None
    html: str | None = None
    block_description: str = "A chart or other image that contains data."

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        child_ref_blocks = [
            block
            for block in child_blocks
            if block.id.block_type == BlockTypes.Reference
        ]
        html = super().assemble_html(
            document, child_ref_blocks, parent_structure, block_config
        )
        if self.description:
            html += f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
        return html
```

--------------------------------------------------------------------------------
/marker/schema/blocks/equation.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Equation(Block):
    block_type: BlockTypes = BlockTypes.Equation
    html: str | None = None
    block_description: str = "A block math equation."

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        if self.html:
            child_ref_blocks = [
                block
                for block in child_blocks
                if block.id.block_type == BlockTypes.Reference
            ]
            html_out = super().assemble_html(
                document, child_ref_blocks, parent_structure, block_config
            )
            html_out += f"""<p block-type='{self.block_type}'>{self.html}</p>"""
            return html_out
        else:
            template = super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
            return f"<p block-type='{self.block_type}'>{template}</p>"
```

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/output-bug-report.md:
--------------------------------------------------------------------------------

```markdown
---
name: Output bug report
about: Create a report about poor output quality
title: "[BUG: Output]"
labels: 'bug: output'
assignees: ''

---

## 📝 Describe the Output Issue

A clear and concise description of the incorrect or unexpected output.

## 📄 Input Document

Attach the PDF or input file used.

## 📤 Current Output

Paste the Markdown or HTML that Marker generated:

````markdown
Paste output here
````

## ✅ Expected Output

Describe or paste what you expected Marker to generate.

## ⚙️ Environment

Please fill in all relevant details:

* **Marker version**:
* **Surya version**:
* **Python version**:
* **PyTorch version**:
* **Transformers version**:
* **Operating System**:

## 📟 Command or Code Used

Paste the **exact bash command** or **Python code** you used to run Marker:

<details>
<summary>Click to expand</summary>

```bash
# or Python code block
your_command_here --with-flags
```

</details>

## 📎 Additional Context

Any other relevant info, configs, or assumptions.
```

--------------------------------------------------------------------------------
/tests/converters/test_table_converter.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.converters.table import TableConverter
from marker.renderers.markdown import MarkdownOutput
from marker.util import classes_to_strings


def _table_converter(config, model_dict, renderer, temp_pdf):
    converter = TableConverter(
        artifact_dict=model_dict,
        processor_list=None,
        renderer=classes_to_strings([renderer])[0],
        config=config
    )

    markdown_output: MarkdownOutput = converter(temp_pdf.name)
    markdown = markdown_output.markdown

    assert len(markdown) > 0
    assert "cyclic" in markdown


@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5]})
def test_table_converter(config, model_dict, renderer, temp_doc):
    _table_converter(config, model_dict, renderer, temp_doc)


@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [5], "force_ocr": True})
def test_table_converter_ocr(config, model_dict, renderer, temp_doc):
    _table_converter(config, model_dict, renderer, temp_doc)
```

--------------------------------------------------------------------------------
/tests/builders/test_blank_page.py:
--------------------------------------------------------------------------------

```python
from surya.layout.schema import LayoutResult

from marker.builders.document import DocumentBuilder
from marker.builders.layout import LayoutBuilder
from marker.builders.line import LineBuilder


def test_blank_page(config, doc_provider, layout_model, ocr_error_model, detection_model):
    layout_builder = LayoutBuilder(layout_model, config)
    line_builder = LineBuilder(detection_model, ocr_error_model)
    builder = DocumentBuilder(config)
    document = builder.build_document(doc_provider)

    layout_results = [LayoutResult(
        bboxes=[],
        image_bbox=p.polygon.bbox,
    ) for p in document.pages]
    provider_lines = {p.page_id: [] for p in document.pages}
    ocr_lines = {p.page_id: [] for p in document.pages}

    layout_builder.add_blocks_to_pages(document.pages, layout_results)
    line_builder.merge_blocks(document, provider_lines, ocr_lines)

    assert all([isinstance(p.children, list) for p in document.pages])
    assert all([isinstance(p.structure, list) for p in document.pages])
```

--------------------------------------------------------------------------------
/marker/schema/blocks/complexregion.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class ComplexRegion(Block):
    block_type: BlockTypes = BlockTypes.ComplexRegion
    html: str | None = None
    block_description: str = "A complex region that can consist of multiple different types of blocks mixed with images. This block is chosen when it is difficult to categorize the region as a single block type."
    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        if self.html:
            child_ref_blocks = [
                block
                for block in child_blocks
                if block.id.block_type == BlockTypes.Reference
            ]
            html = super().assemble_html(
                document, child_ref_blocks, parent_structure, block_config
            )
            return html + self.html
        else:
            template = super().assemble_html(
                document, child_blocks, parent_structure, block_config
            )
            return f"<p>{template}</p>"
```

--------------------------------------------------------------------------------
/marker/schema/blocks/picture.py:
--------------------------------------------------------------------------------

```python
from marker.schema import BlockTypes
from marker.schema.blocks import Block


class Picture(Block):
    block_type: BlockTypes = BlockTypes.Picture
    description: str | None = None
    block_description: str = "An image block that represents a picture."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if self.html:
            return super().handle_html_output(
                document, child_blocks, parent_structure, block_config
            )

        child_ref_blocks = [
            block
            for block in child_blocks
            if block.id.block_type == BlockTypes.Reference
        ]
        html = super().assemble_html(
            document, child_ref_blocks, parent_structure, block_config
        )
        if self.description:
            return (
                html
                + f"<p role='img' data-original-image-id='{self.id}'>Image {self.id} description: {self.description}</p>"
            )
        return html
```

--------------------------------------------------------------------------------
/.github/workflows/benchmarks.yml:
--------------------------------------------------------------------------------

```yaml
name: Integration test

on: [push]

env:
  PYTHONIOENCODING: "utf-8"

jobs:
  benchmark:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [t4_gpu, ubuntu-latest]
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install apt dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y pandoc
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install --extras "full"
      - name: Run benchmark test
        run: |
          poetry run python benchmarks/overall/overall.py --max_rows 5
          poetry run python benchmarks/verify_scores.py conversion_results/benchmark/overall/result.json --type marker
      - name: Run table benchmark
        run: |
          poetry run python benchmarks/table/table.py --max_rows 5
          poetry run python benchmarks/verify_scores.py conversion_results/benchmark/table/table.json --type table
```

--------------------------------------------------------------------------------
/marker/services/vertex.py:
--------------------------------------------------------------------------------

```python
from typing import Annotated

from google import genai

from marker.services.gemini import BaseGeminiService


class GoogleVertexService(BaseGeminiService):
    vertex_project_id: Annotated[
        str,
        "Google Cloud Project ID for Vertex AI.",
    ] = None
    vertex_location: Annotated[
        str,
        "Google Cloud Location for Vertex AI.",
    ] = "us-central1"
    gemini_model_name: Annotated[
        str, "The name of the Google model to use for the service."
    ] = "gemini-2.0-flash-001"
    vertex_dedicated: Annotated[
        bool, "Whether to use a dedicated Vertex AI instance."
    ] = False

    def get_google_client(self, timeout: int):
        http_options = {"timeout": timeout * 1000}  # Convert to milliseconds
        if self.vertex_dedicated:
            http_options["headers"] = {"x-vertex-ai-llm-request-type": "dedicated"}

        return genai.Client(
            vertexai=True,
            project=self.vertex_project_id,
            location=self.vertex_location,
            http_options=http_options,
        )
```

--------------------------------------------------------------------------------
/marker/scripts/chunk_convert.sh:
--------------------------------------------------------------------------------

```bash
#!/bin/bash

trap 'pkill -P $$' SIGINT

# Check if NUM_DEVICES is set
if [[ -z "$NUM_DEVICES" ]]; then
  echo "Please set the NUM_DEVICES environment variable."
  exit 1
fi

if [[ -z "$NUM_WORKERS" ]]; then
  echo "Please set the NUM_WORKERS environment variable."
  exit 1
fi

# Get input folder and output folder from args
if [[ -z "$1" ]]; then
  echo "Please provide an input folder."
  exit 1
fi

if [[ -z "$2" ]]; then
  echo "Please provide an output folder."
  exit 1
fi

INPUT_FOLDER=$1
OUTPUT_FOLDER=$2

# Ensure output folder exists
mkdir -p "$OUTPUT_FOLDER"

# Loop from 0 to NUM_DEVICES and run the marker command in parallel
for (( i=0; i<$NUM_DEVICES; i++ )); do
  DEVICE_NUM=$i
  export DEVICE_NUM
  export NUM_DEVICES
  export NUM_WORKERS
  echo "Running marker on GPU $DEVICE_NUM"
  cmd="CUDA_VISIBLE_DEVICES=$DEVICE_NUM marker $INPUT_FOLDER --output_dir $OUTPUT_FOLDER --num_chunks $NUM_DEVICES --chunk_idx $DEVICE_NUM --workers $NUM_WORKERS"
  eval $cmd &
  sleep 5
done

# Wait for all background processes to finish
wait
```

--------------------------------------------------------------------------------
/marker/schema/blocks/__init__.py:
--------------------------------------------------------------------------------

```python
from __future__ import annotations

from marker.schema.blocks.base import Block, BlockId, BlockOutput
from marker.schema.blocks.caption import Caption
from marker.schema.blocks.code import Code
from marker.schema.blocks.figure import Figure
from marker.schema.blocks.footnote import Footnote
from marker.schema.blocks.form import Form
from marker.schema.blocks.equation import Equation
from marker.schema.blocks.handwriting import Handwriting
from marker.schema.blocks.inlinemath import InlineMath
from marker.schema.blocks.listitem import ListItem
from marker.schema.blocks.pagefooter import PageFooter
from marker.schema.blocks.pageheader import PageHeader
from marker.schema.blocks.picture import Picture
from marker.schema.blocks.sectionheader import SectionHeader
from marker.schema.blocks.table import Table
from marker.schema.blocks.text import Text
from marker.schema.blocks.toc import TableOfContents
from marker.schema.blocks.complexregion import ComplexRegion
from marker.schema.blocks.tablecell import TableCell
from marker.schema.blocks.reference import Reference
```

--------------------------------------------------------------------------------
/marker/schema/blocks/tablecell.py:
--------------------------------------------------------------------------------

```python
from typing import List

from marker.schema import BlockTypes
from marker.schema.blocks import Block


class TableCell(Block):
    block_type: BlockTypes = BlockTypes.TableCell
    rowspan: int
    colspan: int
    row_id: int
    col_id: int
    is_header: bool
    text_lines: List[str] | None = None
    block_description: str = "A cell in a table."
    @property
    def text(self):
        return "\n".join(self.text_lines)

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        add_cell_id = block_config and block_config.get("add_block_ids", False)

        tag_cls = "th" if self.is_header else "td"
        tag = f"<{tag_cls}"
        if self.rowspan > 1:
            tag += f" rowspan={self.rowspan}"
        if self.colspan > 1:
            tag += f" colspan={self.colspan}"
        if add_cell_id:
            tag += f' data-block-id="{self.id}"'

        if self.text_lines is None:
            self.text_lines = []
        text = "<br>".join(self.text_lines)
        return f"{tag}>{text}</{tag_cls}>"
```

--------------------------------------------------------------------------------
/benchmarks/verify_scores.py:
--------------------------------------------------------------------------------

```python
import json
import argparse


def verify_scores(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)

    raw_scores = [data["scores"][k] for k in data["scores"]]
    marker_scores = [r["marker"]["heuristic"]["score"] for r in raw_scores]
    marker_score = sum(marker_scores) / len(marker_scores)
    if marker_score < 90:
        raise ValueError("Marker score below 90")


def verify_table_scores(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)

    avg = sum([r["marker_score"] for r in data["marker"]]) / len(data)
    if avg < 0.7:
        raise ValueError("Average score is below the required threshold of 0.7")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Verify benchmark scores")
    parser.add_argument("file_path", type=str, help="Path to the json file")
    parser.add_argument("--type", type=str, help="Type of file to verify", default="marker")
    args = parser.parse_args()

    if args.type == "marker":
        verify_scores(args.file_path)
    elif args.type == "table":
        verify_table_scores(args.file_path)
```

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/breaking-bug-report.md:
--------------------------------------------------------------------------------

```markdown
---
name: Breaking bug report
about: Create a report about a breaking bug
title: "[BUG: Breaking]"
labels: 'bug: breaking'
assignees: ''

---

## 🧨 Describe the Bug

A clear and concise description of the breaking issue (e.g., crash, OOM, exception, etc).

## 📄 Input Document

Attach the PDF or input file that triggered the error.

## 📤 Output Trace / Stack Trace

Paste the **complete** stack trace or error output, if available.

<details>
<summary>Click to expand</summary>

```
Paste stack trace here
```

</details>

## ⚙️ Environment

Please fill in all relevant details:

- **Marker version**:
- **Surya version**:
- **Python version**:
- **PyTorch version**:
- **Transformers version**:
- **Operating System** (incl. container info if relevant):

## ✅ Expected Behavior

What did you expect Marker to do?

## 📟 Command or Code Used

Paste the **exact bash command** or **Python code** you used to run Marker:

<details>
<summary>Click to expand</summary>

```bash
# or Python code block
your_command_here --with-flags
```

</details>

## 📎 Additional Context

Any other context that might help us debug this (e.g., CLI options, working directory, runtime settings).
```

--------------------------------------------------------------------------------
/marker/models.py:
--------------------------------------------------------------------------------

```python
import os

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = (
    "1"  # Transformers uses .isin for an op, which is not supported on MPS
)

from surya.foundation import FoundationPredictor
from surya.detection import DetectionPredictor
from surya.layout import LayoutPredictor
from surya.ocr_error import OCRErrorPredictor
from surya.recognition import RecognitionPredictor
from surya.table_rec import TableRecPredictor
from surya.settings import settings as surya_settings


def create_model_dict(
    device=None, dtype=None, attention_implementation: str | None = None
) -> dict:
    return {
        "layout_model": LayoutPredictor(FoundationPredictor(checkpoint=surya_settings.LAYOUT_MODEL_CHECKPOINT, attention_implementation=attention_implementation, device=device, dtype=dtype)),
        "recognition_model": RecognitionPredictor(FoundationPredictor(checkpoint=surya_settings.RECOGNITION_MODEL_CHECKPOINT, attention_implementation=attention_implementation, device=device, dtype=dtype)),
        "table_rec_model": TableRecPredictor(device=device, dtype=dtype),
        "detection_model": DetectionPredictor(device=device, dtype=dtype),
        "ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype),
    }
```

--------------------------------------------------------------------------------
/marker/extractors/__init__.py:
--------------------------------------------------------------------------------

```python
from typing import Annotated, Sequence

from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.groups import PageGroup
from PIL import Image

from marker.services import BaseService
from marker.util import assign_config


class BaseExtractor:
    """
    An extractor that uses a provided service to extract structured data from documents.
""" max_concurrency: Annotated[ int, "The maximum number of concurrent requests to make to the Gemini model.", ] = 3 disable_tqdm: Annotated[ bool, "Whether to disable the tqdm progress bar.", ] = False def __init__(self, llm_service: BaseService, config=None): assign_config(self, config) self.llm_service = llm_service def extract_image( self, document: Document, page: PageGroup, remove_blocks: Sequence[BlockTypes] | None = None, highres: bool = False, # Default False to save tokens ) -> Image.Image: return page.get_image( document, highres=highres, remove_blocks=remove_blocks, ) def __call__(self, document: Document, *args, **kwargs): raise NotImplementedError ``` -------------------------------------------------------------------------------- /marker/scripts/file_to_s3.py: -------------------------------------------------------------------------------- ```python import json import shutil import datetime from pathlib import Path import boto3 from huggingface_hub import snapshot_download import click S3_API_URL = "https://1afbe4656a6b40d982ab5e730a39f6b9.r2.cloudflarestorage.com" @click.command(help="Uploads files to an S3 bucket") @click.argument("filepath", type=str) @click.argument("s3_path", type=str) @click.option("--bucket_name", type=str, default="datalab") @click.option("--access_key_id", type=str, default="<access_key_id>") @click.option("--access_key_secret", type=str, default="<access_key_secret>") def main(filepath: str, s3_path: str, bucket_name: str, access_key_id: str, access_key_secret: str): filepath = Path(filepath) # Upload the files to S3 s3_client = boto3.client( 's3', endpoint_url=S3_API_URL, aws_access_key_id=access_key_id, aws_secret_access_key=access_key_secret, region_name="enam" ) s3_key = f"{s3_path}/{filepath.name}" try: s3_client.upload_file( str(filepath), bucket_name, s3_key ) except Exception as e: print(f"Error uploading {filepath}: {str(e)}") print(f"Uploaded files to {s3_path}") if __name__ == "__main__": main() ``` -------------------------------------------------------------------------------- /benchmarks/overall/methods/marker.py: -------------------------------------------------------------------------------- ```python import os import tempfile import time from benchmarks.overall.methods import BaseMethod, BenchmarkResult from marker.config.parser import ConfigParser from marker.converters.pdf import PdfConverter class MarkerMethod(BaseMethod): model_dict: dict = None use_llm: bool = False def __call__(self, sample) -> BenchmarkResult: pdf_bytes = sample["pdf"] # This is a single page PDF parser = ConfigParser({ "page_range": "0", "disable_tqdm": True, "use_llm": self.use_llm, "redo_inline_math": self.use_llm, "llm_service": "marker.services.vertex.GoogleVertexService", "vertex_project_id": os.getenv("VERTEX_PROJECT_ID"), }) block_converter = PdfConverter( artifact_dict=self.model_dict, config=parser.generate_config_dict(), llm_service=parser.get_llm_service() ) with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f: f.write(pdf_bytes) start = time.time() rendered = block_converter(f.name) total = time.time() - start return { "markdown": rendered.markdown, "time": total } ``` -------------------------------------------------------------------------------- /marker/utils/image.py: -------------------------------------------------------------------------------- ```python from PIL import Image import numpy as np import cv2 from typing import List, Optional def is_blank_image(image: Image.Image, polygon: Optional[List[List[int]]] = None) -> bool: 
image = np.asarray(image) if ( image is None or image.size == 0 or image.shape[0] == 0 or image.shape[1] == 0 ): # Handle empty image case return True if polygon is not None: rounded_polys = [[int(corner[0]), int(corner[1])] for corner in polygon] if rounded_polys[0] == rounded_polys[1] and rounded_polys[2] == rounded_polys[3]: return True gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) gray = cv2.GaussianBlur(gray, (7, 7), 0) # Adaptive threshold (inverse for text as white) binarized = cv2.adaptiveThreshold( gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15 ) num_labels, labels, stats, _ = cv2.connectedComponentsWithStats( binarized, connectivity=8 ) cleaned = np.zeros_like(binarized) for i in range(1, num_labels): # skip background cleaned[labels == i] = 255 kernel = np.ones((1, 5), np.uint8) dilated = cv2.dilate(cleaned, kernel, iterations=3) b = dilated / 255 return bool(b.sum() == 0) ``` -------------------------------------------------------------------------------- /marker/schema/blocks/text.py: -------------------------------------------------------------------------------- ```python from marker.schema import BlockTypes from marker.schema.blocks import Block class Text(Block): block_type: BlockTypes = BlockTypes.Text has_continuation: bool = False blockquote: bool = False blockquote_level: int = 0 html: str | None = None block_description: str = "A paragraph or line of text." def assemble_html( self, document, child_blocks, parent_structure, block_config=None ): if self.ignore_for_output: return "" # This happens when we used an llm processor if self.html: return super().handle_html_output( document, child_blocks, parent_structure, block_config ) template = super().assemble_html( document, child_blocks, parent_structure, block_config ) template = template.replace("\n", " ") el_attr = f" block-type='{self.block_type}'" if self.has_continuation: el_attr += " class='has-continuation'" if self.blockquote: blockquote_prefix = "<blockquote>" * self.blockquote_level blockquote_suffix = "</blockquote>" * self.blockquote_level return f"{blockquote_prefix}<p{el_attr}>{template}</p>{blockquote_suffix}" else: return f"<p{el_attr}>{template}</p>" ``` -------------------------------------------------------------------------------- /.github/workflows/cla.yml: -------------------------------------------------------------------------------- ```yaml name: "Marker CLA Assistant" on: issue_comment: types: [created] pull_request_target: types: [opened,closed,synchronize] # explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings permissions: actions: write contents: write pull-requests: write statuses: write jobs: CLAAssistant: runs-on: ubuntu-latest steps: - name: "Marker CLA Assistant" if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target' uses: contributor-assistant/[email protected] env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # the below token should have repo scope and must be manually added by you in the repository's secret # This token is required only if you have configured to store the signatures in a remote repository/organization PERSONAL_ACCESS_TOKEN: ${{ secrets.PERSONAL_ACCESS_TOKEN }} with: path-to-signatures: 'signatures/version1/cla.json' path-to-document: 'https://github.com/VikParuchuri/marker/blob/master/CLA.md' # branch should not be protected branch: 'master' 
allowlist: VikParuchuri,Sandy ``` -------------------------------------------------------------------------------- /tests/builders/test_overriding.py: -------------------------------------------------------------------------------- ```python import multiprocessing as mp import pytest from marker.providers.pdf import PdfProvider from marker.schema import BlockTypes from marker.schema.blocks import SectionHeader from marker.schema.document import Document from marker.schema.registry import register_block_class from marker.schema.text import Line from tests.utils import setup_pdf_provider class NewSectionHeader(SectionHeader): pass class NewLine(Line): pass @pytest.mark.config({ "page_range": [0], "override_map": {BlockTypes.SectionHeader: NewSectionHeader} }) def test_overriding(pdf_document: Document): assert pdf_document.pages[0]\ .get_block(pdf_document.pages[0].structure[0]).__class__ == NewSectionHeader def get_lines(pdf: str, config=None): for block_type, block_cls in config["override_map"].items(): register_block_class(block_type, block_cls) provider: PdfProvider = setup_pdf_provider(pdf, config) return provider.get_page_lines(0) def test_overriding_mp(): config = { "page_range": [0], "override_map": {BlockTypes.Line: NewLine} } pdf_list = ["adversarial.pdf", "adversarial_rot.pdf"] with mp.Pool(processes=2) as pool: results = pool.starmap(get_lines, [(pdf, config) for pdf in pdf_list]) assert all([r[0].line.__class__ == NewLine for r in results]) ``` -------------------------------------------------------------------------------- /.github/workflows/scripts.yml: -------------------------------------------------------------------------------- ```yaml name: Test CLI scripts on: [push] jobs: tests: runs-on: t4_gpu steps: - uses: actions/checkout@v3 - name: Set up Python 3.11 uses: actions/setup-python@v4 with: python-version: 3.11 - name: Install python dependencies run: | pip install poetry poetry install --extras "full" - name: Download benchmark data run: | wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1NHrdYatR1rtqs2gPVfdvO0BAvocH8CJi" unzip -o benchmark_data.zip - name: Test single script run: poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 - name: Test convert script run: poetry run marker benchmark_data/pdfs --max_files 1 --page_range 0 - name: Text convert script multiple workers run: poetry run marker benchmark_data/pdfs --max_files 2 --page_range 0-5 - name: Test llm option run: | poetry run marker_single benchmark_data/pdfs/switch_trans.pdf --page_range 0 --use_llm > output.txt || echo "Command failed but continuing" if ! grep -q "UserWarning" output.txt; then echo "Success: No UserWarning found" exit 0 else echo "Error: UserWarning found in output" exit 1 fi ``` -------------------------------------------------------------------------------- /marker/processors/footnote.py: -------------------------------------------------------------------------------- ```python import re from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.document import Document from marker.schema.groups import PageGroup class FootnoteProcessor(BaseProcessor): """ A processor for pushing footnotes to the bottom, and relabeling mislabeled text blocks. 
""" block_types = (BlockTypes.Footnote,) def __call__(self, document: Document): for page in document.pages: self.push_footnotes_to_bottom(page, document) self.assign_superscripts(page, document) def push_footnotes_to_bottom(self, page: PageGroup, document: Document): footnote_blocks = page.contained_blocks(document, self.block_types) # Push footnotes to the bottom for block in footnote_blocks: # Check if it is top-level if block.id in page.structure: # Move to bottom if it is page.structure.remove(block.id) page.add_structure(block) def assign_superscripts(self, page: PageGroup, document: Document): footnote_blocks = page.contained_blocks(document, self.block_types) for block in footnote_blocks: for span in block.contained_blocks(document, (BlockTypes.Span,)): if re.match(r"^[0-9\W]+", span.text): span.has_superscript = True break ``` -------------------------------------------------------------------------------- /marker/scripts/convert_single.py: -------------------------------------------------------------------------------- ```python import os os.environ["GRPC_VERBOSITY"] = "ERROR" os.environ["GLOG_minloglevel"] = "2" os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = ( "1" # Transformers uses .isin for a simple op, which is not supported on MPS ) import time import click from marker.config.parser import ConfigParser from marker.config.printer import CustomClickPrinter from marker.logger import configure_logging, get_logger from marker.models import create_model_dict from marker.output import save_output configure_logging() logger = get_logger() @click.command(cls=CustomClickPrinter, help="Convert a single PDF to markdown.") @click.argument("fpath", type=str) @ConfigParser.common_options def convert_single_cli(fpath: str, **kwargs): models = create_model_dict() start = time.time() config_parser = ConfigParser(kwargs) converter_cls = config_parser.get_converter_cls() converter = converter_cls( config=config_parser.generate_config_dict(), artifact_dict=models, processor_list=config_parser.get_processors(), renderer=config_parser.get_renderer(), llm_service=config_parser.get_llm_service(), ) rendered = converter(fpath) out_folder = config_parser.get_output_folder(fpath) save_output(rendered, out_folder, config_parser.get_base_filename(fpath)) logger.info(f"Saved markdown to {out_folder}") logger.info(f"Total time: {time.time() - start}") ``` -------------------------------------------------------------------------------- /marker/schema/blocks/inlinemath.py: -------------------------------------------------------------------------------- ```python from marker.schema import BlockTypes from marker.schema.blocks import Block class InlineMath(Block): block_type: BlockTypes = BlockTypes.TextInlineMath has_continuation: bool = False blockquote: bool = False blockquote_level: int = 0 block_description: str = "A text block that contains inline math. This is not used for italic text or references - only for text that contains math." 
html: str | None = None def assemble_html( self, document, child_blocks, parent_structure, block_config=None ): if self.ignore_for_output: return "" if self.html: return super().handle_html_output( document, child_blocks, parent_structure, block_config ) template = super().assemble_html( document, child_blocks, parent_structure, block_config ) template = template.replace("\n", " ") el_attr = f" block-type='{self.block_type}'" if self.has_continuation: el_attr += " class='has-continuation'" if self.blockquote: # Add indentation for blockquote levels blockquote_prefix = "<blockquote>" * self.blockquote_level blockquote_suffix = "</blockquote>" * self.blockquote_level return f"{blockquote_prefix}<p{el_attr}>{template}</p>{blockquote_suffix}" else: return f"<p{el_attr}>{template}</p>" ``` -------------------------------------------------------------------------------- /tests/builders/test_layout_replace.py: -------------------------------------------------------------------------------- ```python import pytest from marker.builders.document import DocumentBuilder from marker.builders.layout import LayoutBuilder from marker.builders.line import LineBuilder from marker.renderers.markdown import MarkdownRenderer from marker.schema import BlockTypes from marker.schema.registry import get_block_class @pytest.mark.filename("thinkpython.pdf") @pytest.mark.config({"page_range": [0]}) def test_layout_replace( request, config, doc_provider, layout_model, ocr_error_model, detection_model ): # The llm layout builder replaces blocks - this makes sure text is still merged properly layout_builder = LayoutBuilder(layout_model, config) line_builder = LineBuilder(detection_model, ocr_error_model, config) builder = DocumentBuilder(config) document = builder.build_document(doc_provider) layout_builder(document, doc_provider) page = document.pages[0] new_blocks = [] for block in page.contained_blocks(document, (BlockTypes.Text,)): generated_block_class = get_block_class(BlockTypes.TextInlineMath) generated_block = generated_block_class( polygon=block.polygon, page_id=block.page_id, structure=block.structure, ) page.replace_block(block, generated_block) new_blocks.append(generated_block) line_builder(document, doc_provider) for block in new_blocks: assert block.raw_text(document).strip() renderer = MarkdownRenderer(config) rendered = renderer(document) assert "Think Python" in rendered.markdown ``` -------------------------------------------------------------------------------- /marker/processors/code.py: -------------------------------------------------------------------------------- ```python from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.blocks import Code from marker.schema.document import Document class CodeProcessor(BaseProcessor): """ A processor for formatting code blocks. 
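
    Indentation is reconstructed heuristically: the average character width across the
    block's lines is used to translate each line's horizontal offset from the leftmost
    line into a number of leading spaces.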
""" block_types = (BlockTypes.Code, ) def __call__(self, document: Document): for page in document.pages: for block in page.contained_blocks(document, self.block_types): self.format_block(document, block) def format_block(self, document: Document, block: Code): min_left = 9999 # will contain x- coord of column 0 total_width = 0 total_chars = 0 contained_lines = block.contained_blocks(document, (BlockTypes.Line,)) for line in contained_lines: min_left = min(line.polygon.bbox[0], min_left) total_width += line.polygon.width total_chars += len(line.raw_text(document)) avg_char_width = total_width / max(total_chars, 1) code_text = "" is_new_line = False for line in contained_lines: text = line.raw_text(document) if avg_char_width == 0: prefix = "" else: total_spaces = int((line.polygon.bbox[0] - min_left) / avg_char_width) prefix = " " * max(0, total_spaces) if is_new_line: text = prefix + text code_text += text is_new_line = text.endswith("\n") block.code = code_text.rstrip() ``` -------------------------------------------------------------------------------- /marker/settings.py: -------------------------------------------------------------------------------- ```python from typing import Optional from dotenv import find_dotenv from pydantic import computed_field from pydantic_settings import BaseSettings import torch import os class Settings(BaseSettings): # Paths BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) OUTPUT_DIR: str = os.path.join(BASE_DIR, "conversion_results") FONT_DIR: str = os.path.join(BASE_DIR, "static", "fonts") DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data") ARTIFACT_URL: str = "https://models.datalab.to/artifacts" FONT_NAME: str = "GoNotoCurrent-Regular.ttf" FONT_PATH: str = os.path.join(FONT_DIR, FONT_NAME) LOGLEVEL: str = "INFO" # General OUTPUT_ENCODING: str = "utf-8" OUTPUT_IMAGE_FORMAT: str = "JPEG" # LLM GOOGLE_API_KEY: Optional[str] = "" # General models TORCH_DEVICE: Optional[str] = ( None # Note: MPS device does not work for text detection, and will default to CPU ) @computed_field @property def TORCH_DEVICE_MODEL(self) -> str: if self.TORCH_DEVICE is not None: return self.TORCH_DEVICE if torch.cuda.is_available(): return "cuda" if torch.backends.mps.is_available(): return "mps" return "cpu" @computed_field @property def MODEL_DTYPE(self) -> torch.dtype: if self.TORCH_DEVICE_MODEL == "cuda": return torch.bfloat16 else: return torch.float32 class Config: env_file = find_dotenv("local.env") extra = "ignore" settings = Settings() ``` -------------------------------------------------------------------------------- /marker/converters/ocr.py: -------------------------------------------------------------------------------- ```python from typing import Tuple from marker.builders.document import DocumentBuilder from marker.builders.line import LineBuilder from marker.builders.ocr import OcrBuilder from marker.converters.pdf import PdfConverter from marker.processors import BaseProcessor from marker.processors.equation import EquationProcessor from marker.providers.registry import provider_from_filepath from marker.renderers.ocr_json import OCRJSONRenderer class OCRConverter(PdfConverter): default_processors: Tuple[BaseProcessor, ...] 
= (EquationProcessor,) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if not self.config: self.config = {} self.config["force_ocr"] = True self.renderer = OCRJSONRenderer def build_document(self, filepath: str): provider_cls = provider_from_filepath(filepath) layout_builder = self.resolve_dependencies(self.layout_builder_class) line_builder = self.resolve_dependencies(LineBuilder) ocr_builder = self.resolve_dependencies(OcrBuilder) document_builder = DocumentBuilder(self.config) provider = provider_cls(filepath, self.config) document = document_builder(provider, layout_builder, line_builder, ocr_builder) for processor in self.processor_list: processor(document) return document def __call__(self, filepath: str): document = self.build_document(filepath) self.page_count = len(document.pages) renderer = self.resolve_dependencies(self.renderer) return renderer(document) ``` -------------------------------------------------------------------------------- /tests/builders/test_document_builder.py: -------------------------------------------------------------------------------- ```python import pytest from marker.schema import BlockTypes from marker.schema.text.line import Line @pytest.mark.filename("thinkpython.pdf") @pytest.mark.config({"page_range": [0]}) def test_document_builder(pdf_document): first_page = pdf_document.pages[0] assert first_page.structure[0] == "/page/0/SectionHeader/0" first_block = first_page.get_block(first_page.structure[0]) assert first_block.block_type == BlockTypes.SectionHeader assert first_block.text_extraction_method == "pdftext" first_text_block: Line = first_page.get_block(first_block.structure[0]) assert first_text_block.block_type == BlockTypes.Line first_span = first_page.get_block(first_text_block.structure[0]) assert first_span.block_type == BlockTypes.Span assert first_span.text == "Think Python" assert first_span.font == "URWPalladioL-Roma" assert first_span.formats == ["plain"] @pytest.mark.config({"page_range": [0]}) def test_document_builder_inline_eq(pdf_document): first_page = pdf_document.pages[0] assert first_page.structure[0] == "/page/0/SectionHeader/0" first_block = first_page.get_block(first_page.structure[0]) assert first_block.block_type == BlockTypes.SectionHeader assert first_block.text_extraction_method == "surya" first_text_block: Line = first_page.get_block(first_block.structure[0]) assert first_text_block.block_type == BlockTypes.Line first_span = first_page.get_block(first_text_block.structure[0]) assert first_span.block_type == BlockTypes.Span assert first_span.text.strip() == "Subspace Adversarial Training" assert "bold" in first_span.formats ``` -------------------------------------------------------------------------------- /marker/services/__init__.py: -------------------------------------------------------------------------------- ```python from typing import Optional, List, Annotated from io import BytesIO import PIL from pydantic import BaseModel from marker.schema.blocks import Block from marker.util import assign_config, verify_config_keys import base64 class BaseService: timeout: Annotated[int, "The timeout to use for the service."] = 30 max_retries: Annotated[ int, "The maximum number of retries to use for the service." ] = 2 retry_wait_time: Annotated[int, "The wait time between retries."] = 3 max_output_tokens: Annotated[ int, "The maximum number of output tokens to generate." 
] = None def img_to_base64(self, img: PIL.Image.Image, format: str = "WEBP"): image_bytes = BytesIO() img.save(image_bytes, format=format) return base64.b64encode(image_bytes.getvalue()).decode("utf-8") def process_images(self, images: List[PIL.Image.Image]) -> list: raise NotImplementedError def format_image_for_llm(self, image): if not image: return [] if not isinstance(image, list): image = [image] image_parts = self.process_images(image) return image_parts def __init__(self, config: Optional[BaseModel | dict] = None): assign_config(self, config) # Ensure we have all necessary fields filled out (API keys, etc.) verify_config_keys(self) def __call__( self, prompt: str, image: PIL.Image.Image | List[PIL.Image.Image] | None, block: Block | None, response_schema: type[BaseModel], max_retries: int | None = None, timeout: int | None = None, ): raise NotImplementedError ``` -------------------------------------------------------------------------------- /tests/providers/test_document_providers.py: -------------------------------------------------------------------------------- ```python import pytest @pytest.mark.config({"page_range": [0]}) @pytest.mark.filename("lambda.pptx") def test_pptx_provider(doc_provider): assert doc_provider.get_images([0], 72)[0].size == (842, 596) page_lines = doc_provider.get_page_lines(0) spans = page_lines[0].spans assert spans[0].text == "Lambda Calculus" spans = page_lines[1].spans assert spans[0].text == "CSE 340 – Principles of Programming Languages" @pytest.mark.config({"page_range": [0]}) @pytest.mark.filename("manual.epub") def test_epub_provider(doc_provider): assert doc_provider.get_images([0], 72)[0].size == (596, 842) page_lines = doc_provider.get_page_lines(0) spans = page_lines[0].spans assert spans[0].text == "The Project Gutenberg eBook of Simple" @pytest.mark.config({"page_range": [0]}) @pytest.mark.filename("china.html") def test_html_provider(doc_provider): assert doc_provider.get_images([0], 72)[0].size == (596, 842) page_lines = doc_provider.get_page_lines(0) spans = page_lines[0].spans assert spans[0].text == "Jump to content" @pytest.mark.config({"page_range": [0]}) @pytest.mark.filename("gatsby.docx") def test_docx_provider(doc_provider): assert doc_provider.get_images([0], 72)[0].size == (596, 842) page_lines = doc_provider.get_page_lines(0) spans = page_lines[0].spans assert spans[0].text == "Themes" @pytest.mark.config({"page_range": [0]}) @pytest.mark.filename("single_sheet.xlsx") def test_xlsx_provider(doc_provider): assert doc_provider.get_images([0], 72)[0].size == (842, 596) page_lines = doc_provider.get_page_lines(0) spans = page_lines[0].spans assert spans[0].text == "Sheet1" ``` -------------------------------------------------------------------------------- /marker/schema/blocks/listitem.py: -------------------------------------------------------------------------------- ```python import re from marker.schema import BlockTypes from marker.schema.blocks import Block def replace_bullets(child_blocks): # Replace bullet characters with a - first_block = None while len(child_blocks) > 0: first_block = child_blocks[0] child_blocks = first_block.children if first_block is not None and first_block.id.block_type == BlockTypes.Line: bullet_pattern = r"(^|[\n ]|<[^>]*>)[•●○ഠ ം◦■▪▫–—-]( )" first_block.html = re.sub(bullet_pattern, r"\1\2", first_block.html) class ListItem(Block): block_type: BlockTypes = BlockTypes.ListItem list_indent_level: int = 0 block_description: str = "A list item that is part of a list. 
This block is used to represent a single item in a list."
    html: str | None = None

    def assemble_html(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        template = super().assemble_html(
            document, child_blocks, parent_structure, block_config
        )
        template = template.replace("\n", " ")

        # Remove the first bullet character
        replace_bullets(child_blocks)

        if self.html:
            template = (
                super()
                .handle_html_output(
                    document, child_blocks, parent_structure, block_config
                )
                .strip()
            )
            template = template.replace("<li>", "").replace("</li>", "")

        el_attr = f" block-type='{self.block_type}'"
        if self.list_indent_level:
            return f"<ul><li{el_attr} class='list-indent-{self.list_indent_level}'>{template}</li></ul>"
        return f"<li{el_attr}>{template}</li>"
```

--------------------------------------------------------------------------------
/marker/providers/image.py:
--------------------------------------------------------------------------------

```python
from typing import List, Annotated

from PIL import Image

from marker.providers import ProviderPageLines, BaseProvider
from marker.schema.polygon import PolygonBox
from marker.schema.text import Line
from pdftext.schema import Reference


class ImageProvider(BaseProvider):
    page_range: Annotated[
        List[int],
        "The range of pages to process.",
        "Default is None, which will process all pages.",
    ] = None
    image_count: int = 1

    def __init__(self, filepath: str, config=None):
        super().__init__(filepath, config)

        self.images = [Image.open(filepath)]
        self.page_lines: ProviderPageLines = {i: [] for i in range(self.image_count)}

        if self.page_range is None:
            self.page_range = range(self.image_count)

        assert max(self.page_range) < self.image_count and min(self.page_range) >= 0, (
            f"Invalid page range, values must be between 0 and {self.image_count - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."
) self.page_bboxes = { i: [0, 0, self.images[i].size[0], self.images[i].size[1]] for i in self.page_range } def __len__(self): return self.image_count def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]: return [self.images[i] for i in idxs] def get_page_bbox(self, idx: int) -> PolygonBox | None: bbox = self.page_bboxes[idx] if bbox: return PolygonBox.from_bbox(bbox) def get_page_lines(self, idx: int) -> List[Line]: return self.page_lines[idx] def get_page_refs(self, idx: int) -> List[Reference]: return [] ``` -------------------------------------------------------------------------------- /tests/builders/test_ocr_pipeline.py: -------------------------------------------------------------------------------- ```python import pytest from marker.schema import BlockTypes from marker.schema.text.line import Line def _ocr_pipeline_test(pdf_document): first_page = pdf_document.pages[0] assert first_page.structure[0] == "/page/0/SectionHeader/0" first_block = first_page.get_block(first_page.structure[0]) assert first_block.text_extraction_method == "surya" assert first_block.block_type == BlockTypes.SectionHeader first_text_block: Line = first_page.get_block(first_block.structure[0]) assert first_text_block.block_type == BlockTypes.Line first_span = first_page.get_block(first_text_block.structure[0]) assert first_span.block_type == BlockTypes.Span assert first_span.text.strip() == "Subspace Adversarial Training" # Ensure we match all text lines up properly # Makes sure the OCR bbox is being scaled to the same scale as the layout boxes text_lines = first_page.contained_blocks(pdf_document, (BlockTypes.Line,)) text_blocks = first_page.contained_blocks( pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath) ) # assert len(text_lines) == 83 # Ensure the bbox sizes match up max_line_position = max([line.polygon.y_end for line in text_lines]) max_block_position = max( [block.polygon.y_end for block in text_blocks if block.source == "layout"] ) assert max_line_position <= (max_block_position * 1.02) @pytest.mark.config({"force_ocr": True, "page_range": [0]}) def test_ocr_pipeline(pdf_document): _ocr_pipeline_test(pdf_document) @pytest.mark.config({"force_ocr": True, "page_range": [0], "use_llm": True}) def test_ocr_with_inline_pipeline(pdf_document): _ocr_pipeline_test(pdf_document) ``` -------------------------------------------------------------------------------- /benchmarks/table/gemini.py: -------------------------------------------------------------------------------- ```python import json from PIL import Image from google import genai from google.genai import types from io import BytesIO from pydantic import BaseModel from marker.settings import settings prompt = """ You're an expert document analyst who is good at turning tables in documents into HTML. Analyze the provided image, and convert it to a faithful HTML representation. Guidelines: - Keep the HTML simple and concise. - Only include the <table> tag and contents. - Only use <table>, <tr>, and <td> tags. Only use the colspan and rowspan attributes if necessary. Do not use <tbody>, <thead>, or <th> tags. - Make sure the table is as faithful to the image as possible with the given tags. **Instructions** 1. Analyze the image, and determine the table structure. 2. Convert the table image to HTML, following the guidelines above. 3. Output only the HTML for the table, starting with the <table> tag and ending with the </table> tag. 
""".strip() class TableSchema(BaseModel): table_html: str def gemini_table_rec(image: Image.Image): client = genai.Client( api_key=settings.GOOGLE_API_KEY, http_options={"timeout": 60000} ) image_bytes = BytesIO() image.save(image_bytes, format="PNG") responses = client.models.generate_content( model="gemini-2.0-flash", contents=[types.Part.from_bytes(data=image_bytes.getvalue(), mime_type="image/png"), prompt], # According to gemini docs, it performs better if the image is the first element config={ "temperature": 0, "response_schema": TableSchema, "response_mime_type": "application/json", }, ) output = responses.candidates[0].content.parts[0].text return json.loads(output)["table_html"] ``` -------------------------------------------------------------------------------- /tests/builders/test_pdf_links.py: -------------------------------------------------------------------------------- ```python import re import pytest from marker.converters.pdf import PdfConverter from marker.renderers.markdown import MarkdownOutput from marker.schema import BlockTypes from marker.schema.document import Document from marker.util import classes_to_strings @pytest.mark.filename("arxiv_test.pdf") @pytest.mark.output_format("markdown") @pytest.mark.config({"disable_ocr": True}) def test_pdf_links(pdf_document: Document, config, renderer, model_dict, temp_doc): first_page = pdf_document.pages[1] processors = ["marker.processors.reference.ReferenceProcessor"] pdf_converter = PdfConverter( artifact_dict=model_dict, processor_list=processors, renderer=classes_to_strings([renderer])[0], config=config, ) for section_header_span in first_page.contained_blocks( pdf_document, (BlockTypes.Span,) ): if "II." in section_header_span.text: assert section_header_span.url == "#page-1-0" break else: raise ValueError("Could not find II. in the first page") section_header_block = first_page.contained_blocks( pdf_document, (BlockTypes.SectionHeader,) )[0] assert section_header_block.raw_text(pdf_document) == "II. THEORETICAL FRAMEWORK\n" assert first_page.refs[0].ref == "page-1-0" markdown_output: MarkdownOutput = pdf_converter(temp_doc.name) markdown = markdown_output.markdown assert "[II.](#page-1-0)" in markdown assert '<span id="page-1-0"></span>II. 
THEORETICAL FRAMEWORK' in markdown for ref in set( [ f'<span id="page-{m[0]}-{m[1]}">' for m in re.findall(r"\]\(#page-(\d+)-(\d+)\)", markdown) ] ): assert ref in markdown, f"Reference {ref} not found in markdown" ``` -------------------------------------------------------------------------------- /benchmarks/overall/download/llamaparse.py: -------------------------------------------------------------------------------- ```python import io import time import requests from benchmarks.overall.download.base import Downloader class LlamaParseDownloader(Downloader): service = "llamaparse" def get_html(self, pdf_bytes): rand_name = str(time.time()) + ".pdf" start = time.time() buff = io.BytesIO(pdf_bytes) md = upload_and_parse_file(self.api_key, rand_name, buff) end = time.time() if isinstance(md, bytes): md = md.decode("utf-8") return { "md": md, "time": end - start, } def upload_and_parse_file(api_key: str, fname: str, buff, max_retries: int = 180, delay: int = 1): headers = { "Authorization": f"Bearer {api_key}", "Accept": "application/json" } # Upload file files = { 'file': (fname, buff, 'application/pdf') } response = requests.post( 'https://api.cloud.llamaindex.ai/api/v1/parsing/upload', headers=headers, files=files ) response.raise_for_status() job_id = response.json()['id'] # Poll for completion for _ in range(max_retries): status_response = requests.get( f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}', headers=headers ) status_response.raise_for_status() if status_response.json()['status'] == 'SUCCESS': # Get results result_response = requests.get( f'https://api.cloud.llamaindex.ai/api/v1/parsing/job/{job_id}/result/markdown', headers=headers ) result_response.raise_for_status() return result_response.json()['markdown'] time.sleep(delay) raise TimeoutError("Job did not complete within the maximum retry attempts") ``` -------------------------------------------------------------------------------- /benchmarks/overall/download/mistral.py: -------------------------------------------------------------------------------- ```python import io import time import requests from benchmarks.overall.download.base import Downloader class MistralDownloader(Downloader): service = "mistral" def get_html(self, pdf_bytes): rand_name = str(time.time()) + ".pdf" start = time.time() buff = io.BytesIO(pdf_bytes) md = upload_and_process_file(self.api_key, rand_name, buff) end = time.time() if isinstance(md, bytes): md = md.decode("utf-8") return { "md": md, "time": end - start, } def upload_and_process_file(api_key: str, fname: str, buff): headers = { "Authorization": f"Bearer {api_key}" } upload_headers = headers.copy() files = { 'file': (fname, buff, 'application/pdf'), 'purpose': (None, 'ocr') } upload_response = requests.post( 'https://api.mistral.ai/v1/files', headers=upload_headers, files=files ) upload_response.raise_for_status() file_id = upload_response.json()['id'] url_headers = headers.copy() url_headers["Accept"] = "application/json" url_response = requests.get( f'https://api.mistral.ai/v1/files/{file_id}/url?expiry=24', headers=url_headers ) url_response.raise_for_status() signed_url = url_response.json()['url'] ocr_headers = headers.copy() ocr_headers["Content-Type"] = "application/json" ocr_data = { "model": "mistral-ocr-latest", "document": { "type": "document_url", "document_url": signed_url }, "include_image_base64": True } ocr_response = requests.post( 'https://api.mistral.ai/v1/ocr', headers=ocr_headers, json=ocr_data ) ocr_response.raise_for_status() result = 
ocr_response.json() return result["pages"][0]["markdown"] ``` -------------------------------------------------------------------------------- /tests/config/test_config.py: -------------------------------------------------------------------------------- ```python import sys from contextlib import suppress import click from marker.config.printer import CustomClickPrinter from marker.config.crawler import crawler from marker.config.parser import ConfigParser def capture_kwargs(argv): command = click.command(cls=CustomClickPrinter) captured_kwargs = {} def parse_args(**kwargs): captured_kwargs.update(kwargs) return kwargs original_argv = sys.argv sys.argv = argv try: with suppress(SystemExit): command(ConfigParser.common_options(parse_args))() finally: sys.argv = original_argv return captured_kwargs def test_config_parser(): sys.argv = [ "test", "--disable_multiprocessing", "--output_dir", "output_dir", "--height_tolerance", "0.5", ] kwargs = capture_kwargs(sys.argv) parser = ConfigParser(kwargs) config_dict = parser.generate_config_dict() # Validate kwarg capturing assert kwargs["disable_multiprocessing"] assert kwargs["output_dir"] == "output_dir" assert config_dict["pdftext_workers"] == 1 # disabling multiprocessing does this assert config_dict["height_tolerance"] == 0.5 def test_config_none(): kwargs = capture_kwargs(["test"]) for key in crawler.attr_set: # We force some options to become flags for ease of use on the CLI value = None assert kwargs.get(key) is value def test_config_llm(): kwargs = capture_kwargs(["test", "--use_llm"]) parser = ConfigParser(kwargs) config_dict = parser.generate_config_dict() # Validate kwarg capturing assert config_dict["use_llm"] def test_config_force_ocr(): kwargs = capture_kwargs(["test", "--force_ocr"]) parser = ConfigParser(kwargs) config_dict = parser.generate_config_dict() # Validate kwarg capturing assert config_dict["force_ocr"] ``` -------------------------------------------------------------------------------- /tests/converters/test_ocr_converter.py: -------------------------------------------------------------------------------- ```python import pytest from marker.converters.ocr import OCRConverter from marker.renderers.ocr_json import OCRJSONOutput, OCRJSONPageOutput def _ocr_converter(config, model_dict, temp_pdf, line_count: int, eq_count: int): converter = OCRConverter(artifact_dict=model_dict, config=config) ocr_json: OCRJSONOutput = converter(temp_pdf.name) pages = ocr_json.children assert len(pages) == 1 # assert len(pages[0].children) == line_count eqs = [line for line in pages[0].children if line.block_type == "Equation"] assert len(eqs) == eq_count return pages def check_bboxes(page: OCRJSONPageOutput, lines): page_size = page.bbox for line in lines: assert len(line.children) > 0 for child in line.children: bbox = child.bbox assert all( [ bbox[0] >= page_size[0], bbox[1] >= page_size[1], bbox[2] <= page_size[2], bbox[3] <= page_size[3], ] ), "Child bbox is outside page bbox" @pytest.mark.config({"page_range": [0]}) def test_ocr_converter(config, model_dict, temp_doc): _ocr_converter(config, model_dict, temp_doc, 85, 2) @pytest.mark.filename("pres.pdf") @pytest.mark.config({"page_range": [1], "force_ocr": True, "keep_chars": True}) def test_ocr_converter_force(config, model_dict, temp_doc): pages = _ocr_converter(config, model_dict, temp_doc, 10, 0) lines = [line for line in pages[0].children if line.block_type == "Line"] check_bboxes(pages[0], lines) @pytest.mark.filename("pres.pdf") @pytest.mark.config({"page_range": [1], 
"keep_chars": True}) def test_ocr_converter_keep(config, model_dict, temp_doc): pages = _ocr_converter(config, model_dict, temp_doc, 10, 0) lines = [line for line in pages[0].children if line.block_type == "Line"] check_bboxes(pages[0], lines) ``` -------------------------------------------------------------------------------- /benchmarks/overall/display/dataset.py: -------------------------------------------------------------------------------- ```python import json from typing import List import datasets from tqdm import tqdm from benchmarks.overall.registry import METHOD_REGISTRY from benchmarks.overall.schema import FullResult def build_dataset(bench_dataset: datasets.Dataset, result: FullResult, score_types: List[str], max_rows: int | None = None) -> datasets.Dataset: rows = [] for idx, sample in tqdm(enumerate(bench_dataset), desc="Building dataset"): if idx not in result["markdown"]: continue if max_rows is not None and idx >= max_rows: break row = { "uuid": sample["uuid"], "classification": sample["classification"], "language": sample["language"], "img": sample["img"], } for method in result["markdown"][idx]: if method == "gt": continue method_cls = METHOD_REGISTRY[method]() md = result["markdown"][idx][method] try: method_img = method_cls.render(result["markdown"][idx][method]) except Exception as e: # This can happen when the markdown is None method_img = PIL.Image.new("RGB", (200, 200)) row[f"{method}_md"] = md row[f"{method}_img"] = method_img for score_type in score_types: try: row[f"{method}_{score_type}"] = result["scores"][idx][method][score_type]["score"] except KeyError: row[f"{method}_{score_type}"] = -1.0 # Missing score try: row[f"{method}_{score_type}_detail"] = json.dumps(result["scores"][idx][method][score_type]["specific_scores"]) except KeyError: row[f"{method}_{score_type}_detail"] = "" # Missing detail rows.append(row) ds = datasets.Dataset.from_list(rows) return ds ``` -------------------------------------------------------------------------------- /benchmarks/overall/download/base.py: -------------------------------------------------------------------------------- ```python import json from json import JSONDecodeError from pathlib import Path import datasets from tqdm import tqdm class Downloader: cache_path: Path = Path("cache") service: str def __init__(self, api_key, app_id, max_rows: int = 2200): self.cache_path.mkdir(exist_ok=True) self.max_rows = max_rows self.api_key = api_key self.app_id = app_id self.ds = datasets.load_dataset("datalab-to/marker_benchmark", split="train") def get_html(self, pdf_bytes): raise NotImplementedError def upload_ds(self): rows = [] for file in self.cache_path.glob("*.json"): with open(file, "r") as f: data = json.load(f) rows.append(data) out_ds = datasets.Dataset.from_list(rows, features=datasets.Features({ "md": datasets.Value("string"), "uuid": datasets.Value("string"), "time": datasets.Value("float"), })) out_ds.push_to_hub(f"datalab-to/marker_benchmark_{self.service}", private=True) def generate_data(self): max_rows = self.max_rows for idx, sample in tqdm(enumerate(self.ds), desc=f"Saving {self.service} results"): cache_file = self.cache_path / f"{idx}.json" if cache_file.exists(): continue pdf_bytes = sample["pdf"] # This is a single page PDF try: out_data = self.get_html(pdf_bytes) except JSONDecodeError as e: print(f"Error with sample {idx}: {e}") continue except Exception as e: print(f"Error with sample {idx}: {e}") continue out_data["uuid"] = sample["uuid"] with cache_file.open("w") as f: json.dump(out_data, f) if 
idx >= max_rows: break def __call__(self): self.generate_data() self.upload_ds() ``` -------------------------------------------------------------------------------- /marker/processors/reference.py: -------------------------------------------------------------------------------- ```python import numpy as np from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.blocks import Reference from marker.schema.document import Document from marker.schema.groups.list import ListGroup from marker.schema.groups.table import TableGroup from marker.schema.registry import get_block_class from marker.schema.groups.figure import FigureGroup class ReferenceProcessor(BaseProcessor): """ A processor for adding references to the document. """ def __init__(self, config): super().__init__(config) def __call__(self, document: Document): ReferenceClass: Reference = get_block_class(BlockTypes.Reference) for page in document.pages: refs = page.refs ref_starts = np.array([ref.coord for ref in refs]) blocks = [] for block_id in page.structure: block = page.get_block(block_id) if isinstance(block, (ListGroup, FigureGroup, TableGroup)): blocks.extend([page.get_block(b) for b in block.structure]) else: blocks.append(block) blocks = [b for b in blocks if not b.ignore_for_output] block_starts = np.array([block.polygon.bbox[:2] for block in blocks]) if not (len(refs) and len(block_starts)): continue distances = np.linalg.norm(block_starts[:, np.newaxis, :] - ref_starts[np.newaxis, :, :], axis=2) for ref_idx in range(len(ref_starts)): block_idx = np.argmin(distances[:, ref_idx]) block = blocks[block_idx] ref_block = page.add_full_block(ReferenceClass( ref=refs[ref_idx].ref, polygon=block.polygon, page_id=page.page_id )) if block.structure is None: block.structure = [] block.structure.insert(0, ref_block.id) ``` -------------------------------------------------------------------------------- /marker/builders/document.py: -------------------------------------------------------------------------------- ```python from typing import Annotated from marker.builders import BaseBuilder from marker.builders.layout import LayoutBuilder from marker.builders.line import LineBuilder from marker.builders.ocr import OcrBuilder from marker.providers.pdf import PdfProvider from marker.schema import BlockTypes from marker.schema.document import Document from marker.schema.groups.page import PageGroup from marker.schema.registry import get_block_class class DocumentBuilder(BaseBuilder): """ Constructs a Document given a PdfProvider, LayoutBuilder, and OcrBuilder. 
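
    Rough usage sketch (in practice the builders are resolved and wired together by a
    converter; they are shown explicitly here only for illustration):

        builder = DocumentBuilder(config)
        document = builder(provider, layout_builder, line_builder, ocr_builder)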
""" lowres_image_dpi: Annotated[ int, "DPI setting for low-resolution page images used for Layout and Line Detection.", ] = 96 highres_image_dpi: Annotated[ int, "DPI setting for high-resolution page images used for OCR.", ] = 192 disable_ocr: Annotated[ bool, "Disable OCR processing.", ] = False def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, line_builder: LineBuilder, ocr_builder: OcrBuilder): document = self.build_document(provider) layout_builder(document, provider) line_builder(document, provider) if not self.disable_ocr: ocr_builder(document, provider) return document def build_document(self, provider: PdfProvider): PageGroupClass: PageGroup = get_block_class(BlockTypes.Page) lowres_images = provider.get_images(provider.page_range, self.lowres_image_dpi) highres_images = provider.get_images(provider.page_range, self.highres_image_dpi) initial_pages = [ PageGroupClass( page_id=p, lowres_image=lowres_images[i], highres_image=highres_images[i], polygon=provider.get_page_bbox(p), refs=provider.get_page_refs(p) ) for i, p in enumerate(provider.page_range) ] DocumentClass: Document = get_block_class(BlockTypes.Document) return DocumentClass(filepath=provider.filepath, pages=initial_pages) ``` -------------------------------------------------------------------------------- /marker/services/ollama.py: -------------------------------------------------------------------------------- ```python import json from typing import Annotated, List import PIL import requests from marker.logger import get_logger from pydantic import BaseModel from marker.schema.blocks import Block from marker.services import BaseService logger = get_logger() class OllamaService(BaseService): ollama_base_url: Annotated[ str, "The base url to use for ollama. No trailing slash." ] = "http://localhost:11434" ollama_model: Annotated[str, "The model name to use for ollama."] = ( "llama3.2-vision" ) def process_images(self, images): image_bytes = [self.img_to_base64(img) for img in images] return image_bytes def __call__( self, prompt: str, image: PIL.Image.Image | List[PIL.Image.Image] | None, block: Block | None, response_schema: type[BaseModel], max_retries: int | None = None, timeout: int | None = None, ): url = f"{self.ollama_base_url}/api/generate" headers = {"Content-Type": "application/json"} schema = response_schema.model_json_schema() format_schema = { "type": "object", "properties": schema["properties"], "required": schema["required"], } image_bytes = self.format_image_for_llm(image) payload = { "model": self.ollama_model, "prompt": prompt, "stream": False, "format": format_schema, "images": image_bytes, } try: response = requests.post(url, json=payload, headers=headers) response.raise_for_status() response_data = response.json() total_tokens = ( response_data["prompt_eval_count"] + response_data["eval_count"] ) if block: block.update_metadata(llm_request_count=1, llm_tokens_used=total_tokens) data = response_data["response"] return json.loads(data) except Exception as e: logger.warning(f"Ollama inference failed: {e}") return {} ``` -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- ```toml [tool.poetry] name = "marker-pdf" version = "1.10.1" description = "Convert documents to markdown with high speed and accuracy." 
authors = ["Vik Paruchuri <[email protected]>"] readme = "README.md" license = "GPL-3.0-or-later" repository = "https://github.com/VikParuchuri/marker" keywords = ["pdf", "markdown", "ocr", "nlp"] packages = [ {include = "marker"} ] include = [ "marker/scripts/*.sh", "marker/scripts/*.html", ] [tool.poetry.dependencies] python = "^3.10" Pillow = "^10.1.0" pydantic = "^2.4.2" pydantic-settings = "^2.0.3" transformers = "^4.45.2" python-dotenv = "^1.0.0" torch = "^2.7.0" tqdm = "^4.66.1" ftfy = "^6.1.1" rapidfuzz = "^3.8.1" surya-ocr = "^0.17.0" regex = "^2024.4.28" pdftext = "~0.6.3" markdownify = "^1.1.0" click = "^8.2.0" markdown2 = "^2.5.2" filetype = "^1.2.0" google-genai = "^1.0.0" anthropic = "^0.46.0" pre-commit = "^4.2.0" scikit-learn = "^1.6.1" # Optional dependencies for documents mammoth = {version = "^1.9.0", optional = true} openpyxl = {version = "^3.1.5", optional = true} python-pptx = {version = "^1.0.2", optional = true} ebooklib = {version = "^0.18", optional = true} weasyprint = {version = "^63.1", optional = true} openai = "^1.65.2" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" datasets = "^2.21.0" streamlit = "^1.37.1" fastapi = "^0.115.4" uvicorn = "^0.32.0" python-multipart = "^0.0.16" pytest = "^8.3.3" pytest-mock = "^3.14.0" apted = "1.0.3" distance = "0.1.3" lxml = "5.3.0" tabulate = "^0.9.0" latex2mathml = "^3.77.0" playwright = "^1.49.1" [tool.poetry.extras] full = ["mammoth", "openpyxl", "python-pptx", "ebooklib", "weasyprint"] [tool.poetry.scripts] marker = "marker.scripts.convert:convert_cli" marker_single = "marker.scripts.convert_single:convert_single_cli" marker_chunk_convert = "marker.scripts.chunk_convert:chunk_convert_cli" marker_gui = "marker.scripts.run_streamlit_app:streamlit_app_cli" marker_extract = "marker.scripts.run_streamlit_app:extraction_app_cli" marker_server = "marker.scripts.server:server_cli" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" ``` -------------------------------------------------------------------------------- /marker/converters/table.py: -------------------------------------------------------------------------------- ```python from typing import Tuple, List from marker.builders.document import DocumentBuilder from marker.builders.line import LineBuilder from marker.builders.ocr import OcrBuilder from marker.converters.pdf import PdfConverter from marker.processors import BaseProcessor from marker.processors.llm.llm_complex import LLMComplexRegionProcessor from marker.processors.llm.llm_form import LLMFormProcessor from marker.processors.llm.llm_table import LLMTableProcessor from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor from marker.processors.table import TableProcessor from marker.providers.registry import provider_from_filepath from marker.schema import BlockTypes class TableConverter(PdfConverter): default_processors: Tuple[BaseProcessor, ...] 
= ( TableProcessor, LLMTableProcessor, LLMTableMergeProcessor, LLMFormProcessor, LLMComplexRegionProcessor, ) converter_block_types: List[BlockTypes] = ( BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents, ) def build_document(self, filepath: str): provider_cls = provider_from_filepath(filepath) layout_builder = self.resolve_dependencies(self.layout_builder_class) line_builder = self.resolve_dependencies(LineBuilder) ocr_builder = self.resolve_dependencies(OcrBuilder) document_builder = DocumentBuilder(self.config) document_builder.disable_ocr = True provider = provider_cls(filepath, self.config) document = document_builder(provider, layout_builder, line_builder, ocr_builder) for page in document.pages: page.structure = [ p for p in page.structure if p.block_type in self.converter_block_types ] for processor in self.processor_list: processor(document) return document def __call__(self, filepath: str): document = self.build_document(filepath) self.page_count = len(document.pages) renderer = self.resolve_dependencies(self.renderer) return renderer(document) ``` -------------------------------------------------------------------------------- /marker/schema/blocks/basetable.py: -------------------------------------------------------------------------------- ```python from typing import List from marker.schema import BlockTypes from marker.schema.blocks import Block, BlockOutput from marker.schema.blocks.tablecell import TableCell class BaseTable(Block): block_type: BlockTypes | None = None html: str | None = None @staticmethod def format_cells( document, child_blocks, block_config, child_cells: List[TableCell] | None = None ): if child_cells is None: child_cells: List[TableCell] = [ document.get_block(c.id) for c in child_blocks if c.id.block_type == BlockTypes.TableCell ] unique_rows = sorted(list(set([c.row_id for c in child_cells]))) html_repr = "<table><tbody>" for row_id in unique_rows: row_cells = sorted( [c for c in child_cells if c.row_id == row_id], key=lambda x: x.col_id ) html_repr += "<tr>" for cell in row_cells: html_repr += cell.assemble_html( document, child_blocks, None, block_config ) html_repr += "</tr>" html_repr += "</tbody></table>" return html_repr def assemble_html( self, document, child_blocks: List[BlockOutput], parent_structure=None, block_config: dict | None = None, ): # Filter out the table cells, so they don't render twice child_ref_blocks = [ block for block in child_blocks if block.id.block_type == BlockTypes.Reference ] template = super().assemble_html( document, child_ref_blocks, parent_structure, block_config ) child_block_types = set([c.id.block_type for c in child_blocks]) if self.html: # LLM processor return template + self.html elif len(child_blocks) > 0 and BlockTypes.TableCell in child_block_types: # Table processor return template + self.format_cells(document, child_blocks, block_config) else: # Default text lines and spans return f"<p>{template}</p>" ``` -------------------------------------------------------------------------------- /benchmarks/overall/download/mathpix.py: -------------------------------------------------------------------------------- ```python import json import time import requests from benchmarks.overall.download.base import Downloader class MathpixDownloader(Downloader): service = "mathpix" def get_html(self, pdf_bytes): headers = { "app_id": self.app_id, "app_key": self.api_key, } start = time.time() pdf_id = mathpix_request(pdf_bytes, headers) status = mathpix_status(pdf_id, headers) if status in ["processing", 
"error"]: md = "" else: md = mathpix_results(pdf_id, headers) end = time.time() if isinstance(md, bytes): md = md.decode("utf-8") return { "md": md, "time": end - start } def mathpix_request(buffer, headers): response = requests.post("https://api.mathpix.com/v3/pdf", headers=headers, data={ "options_json": json.dumps( { "conversion_formats": { "md": True, "html": True } } ) }, files={ "file": buffer } ) data = response.json() pdf_id = data["pdf_id"] return pdf_id def mathpix_status(pdf_id, headers): max_iters = 120 i = 0 status = "processing" status2 = "processing" while i < max_iters: time.sleep(1) response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}", headers=headers ) status_resp = response.json() if "conversion_status" not in status_resp: continue status = status_resp["conversion_status"]["md"]["status"] status2 = status_resp["conversion_status"]["html"]["status"] if status == "completed" and status2 == "completed": break elif status == "error" or status2 == "error": break out_status = "completed" if status == "completed" and status2 == "completed" else "error" return out_status def mathpix_results(pdf_id, headers, ext="md"): response = requests.get(f"https://api.mathpix.com/v3/converter/{pdf_id}.{ext}", headers=headers ) return response.content ``` -------------------------------------------------------------------------------- /tests/builders/test_garbled_pdf.py: -------------------------------------------------------------------------------- ```python import pytest from marker.builders.document import DocumentBuilder from marker.builders.line import LineBuilder from marker.processors.table import TableProcessor from marker.schema import BlockTypes @pytest.mark.filename("water_damage.pdf") def test_garbled_pdf(pdf_document, recognition_model, table_rec_model, detection_model): assert pdf_document.pages[0].structure[0] == "/page/0/Table/0" table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0]) assert table_block.block_type == BlockTypes.Table assert table_block.structure[0] == "/page/0/Line/10" table_cell = pdf_document.pages[0].get_block(table_block.structure[0]) assert table_cell.block_type == BlockTypes.Line # We don't OCR in the initial pass, only with the TableProcessor processor = TableProcessor(recognition_model, table_rec_model, detection_model) processor(pdf_document) table = pdf_document.pages[0].contained_blocks(pdf_document, (BlockTypes.Table,))[0] assert "варіант" in table.raw_text(pdf_document) table_cell = pdf_document.pages[0].get_block(table_block.structure[0]) assert table_cell.block_type == BlockTypes.TableCell @pytest.mark.filename("hindi_judgement.pdf") @pytest.mark.config({"page_range": [2, 3], "disable_ocr": True}) def test_garbled_builder(config, doc_provider, detection_model, ocr_error_model): line_builder = LineBuilder(detection_model, ocr_error_model, config) builder = DocumentBuilder(config) document = builder.build_document(doc_provider) bad_ocr_results = line_builder.ocr_error_detection( document.pages, doc_provider.page_lines ) assert len(bad_ocr_results.labels) == 2 assert any([label == "bad" for label in bad_ocr_results.labels]) @pytest.mark.filename("adversarial.pdf") @pytest.mark.config({"page_range": [2, 3], "disable_ocr": True}) def test_nongarbled_builder(config, doc_provider, detection_model, ocr_error_model): line_builder = LineBuilder(detection_model, ocr_error_model, config) builder = DocumentBuilder(config) document = builder.build_document(doc_provider) bad_ocr_results = 
line_builder.ocr_error_detection( document.pages, doc_provider.page_lines ) assert len(bad_ocr_results.labels) == 2 assert all([label == "good" for label in bad_ocr_results.labels]) ``` -------------------------------------------------------------------------------- /marker/processors/order.py: -------------------------------------------------------------------------------- ```python from statistics import mean from collections import defaultdict from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.document import Document class OrderProcessor(BaseProcessor): """ A processor for sorting the blocks in order if needed. This can help when the layout image was sliced. """ block_types = tuple() def __call__(self, document: Document): for page in document.pages: # Skip OCRed pages if page.text_extraction_method != "pdftext": continue # Skip pages without layout slicing if not page.layout_sliced: continue block_idxs = defaultdict(int) for block_id in page.structure: block = document.get_block(block_id) spans = block.contained_blocks(document, (BlockTypes.Span, )) if len(spans) == 0: continue # Avg span position in original PDF block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2 for block_id in page.structure: # Already assigned block id via span position if block_idxs[block_id] > 0: continue block = document.get_block(block_id) prev_block = document.get_prev_block(block) next_block = document.get_next_block(block) block_idx_add = 0 if prev_block: block_idx_add = 1 while prev_block and prev_block.id not in block_idxs: prev_block = document.get_prev_block(prev_block) block_idx_add += 1 if not prev_block: block_idx_add = -1 while next_block and next_block.id not in block_idxs: next_block = document.get_next_block(next_block) block_idx_add -= 1 if not next_block and not prev_block: pass elif prev_block: block_idxs[block_id] = block_idxs[prev_block.id] + block_idx_add else: block_idxs[block_id] = block_idxs[next_block.id] + block_idx_add page.structure = sorted(page.structure, key=lambda x: block_idxs[x]) ``` -------------------------------------------------------------------------------- /tests/converters/test_extraction_converter.py: -------------------------------------------------------------------------------- ```python import json import pytest from marker.converters.extraction import ExtractionConverter from marker.extractors.page import PageExtractionSchema from marker.extractors.document import DocumentExtractionSchema from marker.services import BaseService class MockLLMService(BaseService): def __call__(self, prompt, image=None, page=None, response_schema=None, **kwargs): if response_schema == PageExtractionSchema: return { "description": "Mock extraction description", "detailed_notes": "Mock detailed notes for page extraction", } elif response_schema == DocumentExtractionSchema: return { "analysis": "Mock document analysis", "document_json": json.dumps({"test_key": "test_value"}), } return {} @pytest.fixture def mock_llm_service(): return MockLLMService @pytest.fixture def extraction_converter(config, model_dict, mock_llm_service): test_schema = { "title": "TestSchema", "type": "object", "properties": {"test_key": {"title": "Test Key", "type": "string"}}, "required": ["test_key"], } config["page_schema"] = json.dumps(test_schema) config["output_format"] = "markdown" model_dict["llm_service"] = mock_llm_service converter = ExtractionConverter( artifact_dict=model_dict, processor_list=None, config=config ) 
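    # wire the mock service onto the converter so the test never hits a real LLM backend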
converter.llm_service = mock_llm_service converter.default_llm_service = MockLLMService return converter @pytest.mark.config({"page_range": [0]}) def test_extraction_converter(config, model_dict, mock_llm_service, temp_doc): config["page_schema"] = "invalid json" model_dict["llm_service"] = mock_llm_service converter = ExtractionConverter( artifact_dict=model_dict, processor_list=None, config=config ) converter.artifact_dict["llm_service"] = mock_llm_service() results = converter(temp_doc.name) assert results.document_json == '{"test_key": "test_value"}' @pytest.mark.config({"page_range": [0, 1]}) def test_extraction_converter_multiple_pages(extraction_converter, temp_doc): result = extraction_converter(temp_doc.name) assert result is not None assert result.document_json is not None assert json.loads(result.document_json) == {"test_key": "test_value"} assert result.analysis == "Mock document analysis" ``` -------------------------------------------------------------------------------- /marker/processors/llm/llm_meta.py: -------------------------------------------------------------------------------- ```python from concurrent.futures import ThreadPoolExecutor from typing import List, Dict, Any from marker.logger import get_logger from tqdm import tqdm from marker.processors.llm import BaseLLMSimpleBlockProcessor, BaseLLMProcessor from marker.schema.document import Document from marker.services import BaseService logger = get_logger() class LLMSimpleBlockMetaProcessor(BaseLLMProcessor): """ A wrapper for simple LLM processors, so they can all run in parallel. """ def __init__( self, processor_lst: List[BaseLLMSimpleBlockProcessor], llm_service: BaseService, config=None, ): super().__init__(llm_service, config) self.processors = processor_lst def __call__(self, document: Document): if not self.use_llm or self.llm_service is None: return total = sum( [len(processor.inference_blocks(document)) for processor in self.processors] ) pbar = tqdm( desc="LLM processors running", disable=self.disable_tqdm, total=total ) all_prompts = [ processor.block_prompts(document) for processor in self.processors ] pending = [] futures_map = {} with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: for i, prompt_lst in enumerate(all_prompts): for prompt in prompt_lst: future = executor.submit(self.get_response, prompt) pending.append(future) futures_map[future] = {"processor_idx": i, "prompt_data": prompt} for future in pending: try: result = future.result() future_data = futures_map.pop(future) processor: BaseLLMSimpleBlockProcessor = self.processors[ future_data["processor_idx"] ] # finalize the result processor(result, future_data["prompt_data"], document) except Exception as e: logger.warning(f"Error processing LLM response: {e}") pbar.update(1) pbar.close() def get_response(self, prompt_data: Dict[str, Any]): return self.llm_service( prompt_data["prompt"], prompt_data["image"], prompt_data["block"], prompt_data["schema"], ) ``` -------------------------------------------------------------------------------- /marker/converters/__init__.py: -------------------------------------------------------------------------------- ```python import inspect from typing import Optional, List, Type from pydantic import BaseModel from marker.processors import BaseProcessor from marker.processors.llm import BaseLLMSimpleBlockProcessor from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor from marker.util import assign_config, download_font class BaseConverter: def __init__(self, config: 

--------------------------------------------------------------------------------
/marker/converters/__init__.py:
--------------------------------------------------------------------------------

```python
import inspect
from typing import Optional, List, Type

from pydantic import BaseModel

from marker.processors import BaseProcessor
from marker.processors.llm import BaseLLMSimpleBlockProcessor
from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
from marker.util import assign_config, download_font


class BaseConverter:
    def __init__(self, config: Optional[BaseModel | dict] = None):
        assign_config(self, config)
        self.config = config
        self.llm_service = None

        # Download render font, needed for some providers
        download_font()

    def __call__(self, *args, **kwargs):
        raise NotImplementedError

    def resolve_dependencies(self, cls):
        init_signature = inspect.signature(cls.__init__)
        parameters = init_signature.parameters

        resolved_kwargs = {}
        for param_name, param in parameters.items():
            if param_name == 'self':
                continue
            elif param_name == 'config':
                resolved_kwargs[param_name] = self.config
            elif param.name in self.artifact_dict:
                resolved_kwargs[param_name] = self.artifact_dict[param_name]
            elif param.default != inspect.Parameter.empty:
                resolved_kwargs[param_name] = param.default
            else:
                raise ValueError(f"Cannot resolve dependency for parameter: {param_name}")

        return cls(**resolved_kwargs)

    def initialize_processors(self, processor_cls_lst: List[Type[BaseProcessor]]) -> List[BaseProcessor]:
        processors = []
        for processor_cls in processor_cls_lst:
            processors.append(self.resolve_dependencies(processor_cls))

        simple_llm_processors = [p for p in processors if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
        other_processors = [p for p in processors if not issubclass(type(p), BaseLLMSimpleBlockProcessor)]

        if not simple_llm_processors:
            return processors

        llm_positions = [i for i, p in enumerate(processors) if issubclass(type(p), BaseLLMSimpleBlockProcessor)]
        insert_position = max(0, llm_positions[-1] - len(simple_llm_processors) + 1)

        meta_processor = LLMSimpleBlockMetaProcessor(
            processor_lst=simple_llm_processors,
            llm_service=self.llm_service,
            config=self.config,
        )
        other_processors.insert(insert_position, meta_processor)
        return other_processors
```
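`resolve_dependencies` fills each processor's `__init__` parameters from `self.config` and `self.artifact_dict`, and `initialize_processors` then folds any simple LLM processors into the meta processor above. A toy sketch of how a converter subclass might lean on both; the class below is illustrative only and assumes an `artifact_dict` built elsewhere:

```python
from marker.converters import BaseConverter
from marker.processors.order import OrderProcessor
from marker.processors.blockquote import BlockquoteProcessor


class ToyConverter(BaseConverter):
    # Illustrative subclass, not part of the repository.
    def __init__(self, artifact_dict, config=None):
        super().__init__(config)
        self.artifact_dict = artifact_dict  # assumed: models / llm_service built elsewhere
        # Each class gets its config (and any artifacts it asks for) injected.
        self.processor_list = self.initialize_processors(
            [OrderProcessor, BlockquoteProcessor]
        )

    def __call__(self, document):
        # `document` is assumed to come from the usual builder pipeline.
        for processor in self.processor_list:
            processor(document)
        return document
```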
load_matchers("doc")) is not None: return DocumentProvider if file_match(filepath, load_matchers("xls")) is not None: return SpreadSheetProvider if file_match(filepath, load_matchers("ppt")) is not None: return PowerPointProvider try: with open(filepath, "r", encoding="utf-8") as f: soup = BeautifulSoup(f.read(), "html.parser") # Check if there are any HTML tags if bool(soup.find()): return HTMLProvider except Exception: pass # Fallback if we incorrectly detect the file type return provider_from_ext(filepath) ``` -------------------------------------------------------------------------------- /marker/processors/blank_page.py: -------------------------------------------------------------------------------- ```python from typing import Annotated from PIL import Image import numpy as np import cv2 from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.blocks import Block from marker.schema.document import Document from marker.logger import get_logger logger = get_logger() class BlankPageProcessor(BaseProcessor): """ A processor to filter out blank pages detected as a single layout block """ full_page_block_intersection_threshold: Annotated[ float, "Threshold to detect blank pages at" ] = 0.8 filter_blank_pages: Annotated[bool, "Remove blank pages detected as images."] = ( False ) def is_blank(self, image: Image.Image): image = np.asarray(image) if image.size == 0 or image.shape[0] == 0 or image.shape[1] == 0: # Handle empty image case return True gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) gray = cv2.GaussianBlur(gray, (7, 7), 0) # Adaptive threshold (inverse for text as white) binarized = cv2.adaptiveThreshold( gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15 ) num_labels, labels, stats, _ = cv2.connectedComponentsWithStats( binarized, connectivity=8 ) cleaned = np.zeros_like(binarized) for i in range(1, num_labels): # skip background cleaned[labels == i] = 255 kernel = np.ones((1, 5), np.uint8) dilated = cv2.dilate(cleaned, kernel, iterations=3) b = dilated / 255 return b.sum() == 0 def __call__(self, document: Document): if not self.filter_blank_pages: return for page in document.pages: structure_blocks = page.structure_blocks(document) if not structure_blocks or len(structure_blocks) > 1: continue full_page_block: Block = structure_blocks[0] conditions = [ full_page_block.block_type in [BlockTypes.Picture, BlockTypes.Figure], self.is_blank(full_page_block.get_image(document)), page.polygon.intersection_area(full_page_block.polygon) > self.full_page_block_intersection_threshold, ] if all(conditions): logger.debug(f"Removing blank block {full_page_block.id}") page.remove_structure_items([full_page_block.id]) full_page_block.removed = True ``` -------------------------------------------------------------------------------- /marker/processors/util.py: -------------------------------------------------------------------------------- ```python import re from bs4 import BeautifulSoup from marker.schema import BlockTypes from marker.schema.groups import PageGroup from marker.schema.registry import get_block_class from marker.schema.text import Line def escape_latex_commands(text: str): text = (text .replace('\n', '\\n') .replace('\t', '\\t') .replace('\r', '\\r')) return text def add_math_spans_to_line(corrected_text: str, text_line: Line, page: PageGroup): SpanClass = get_block_class(BlockTypes.Span) corrected_spans = text_to_spans(corrected_text) for span_idx, span in enumerate(corrected_spans): if span_idx == 

--------------------------------------------------------------------------------
/marker/processors/blank_page.py:
--------------------------------------------------------------------------------

```python
from typing import Annotated

from PIL import Image
import numpy as np
import cv2

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.schema.document import Document
from marker.logger import get_logger

logger = get_logger()


class BlankPageProcessor(BaseProcessor):
    """
    A processor to filter out blank pages detected as a single layout block
    """

    full_page_block_intersection_threshold: Annotated[
        float, "Threshold to detect blank pages at"
    ] = 0.8
    filter_blank_pages: Annotated[bool, "Remove blank pages detected as images."] = (
        False
    )

    def is_blank(self, image: Image.Image):
        image = np.asarray(image)
        if image.size == 0 or image.shape[0] == 0 or image.shape[1] == 0:
            # Handle empty image case
            return True

        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        gray = cv2.GaussianBlur(gray, (7, 7), 0)

        # Adaptive threshold (inverse for text as white)
        binarized = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15
        )

        num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
            binarized, connectivity=8
        )
        cleaned = np.zeros_like(binarized)
        for i in range(1, num_labels):  # skip background
            cleaned[labels == i] = 255

        kernel = np.ones((1, 5), np.uint8)
        dilated = cv2.dilate(cleaned, kernel, iterations=3)

        b = dilated / 255
        return b.sum() == 0

    def __call__(self, document: Document):
        if not self.filter_blank_pages:
            return

        for page in document.pages:
            structure_blocks = page.structure_blocks(document)
            if not structure_blocks or len(structure_blocks) > 1:
                continue

            full_page_block: Block = structure_blocks[0]
            conditions = [
                full_page_block.block_type in [BlockTypes.Picture, BlockTypes.Figure],
                self.is_blank(full_page_block.get_image(document)),
                page.polygon.intersection_area(full_page_block.polygon)
                > self.full_page_block_intersection_threshold,
            ]

            if all(conditions):
                logger.debug(f"Removing blank block {full_page_block.id}")
                page.remove_structure_items([full_page_block.id])
                full_page_block.removed = True
```
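`is_blank` binarizes the page image, keeps its connected components, dilates them, and declares the page blank only if nothing survives. A quick self-contained check on a synthetic all-white page, with the processor constructed directly here rather than through the converter pipeline:

```python
from PIL import Image

from marker.processors.blank_page import BlankPageProcessor

# Config keys mirror the annotated class attributes above; is_blank itself
# does not depend on them, so this is just the usual construction pattern.
processor = BlankPageProcessor({"filter_blank_pages": True})
white_page = Image.new("RGB", (800, 1000), "white")
print(processor.is_blank(white_page))  # expected: True, nothing survives binarization
```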

--------------------------------------------------------------------------------
/marker/processors/util.py:
--------------------------------------------------------------------------------

```python
import re

from bs4 import BeautifulSoup

from marker.schema import BlockTypes
from marker.schema.groups import PageGroup
from marker.schema.registry import get_block_class
from marker.schema.text import Line


def escape_latex_commands(text: str):
    text = (text
            .replace('\n', '\\n')
            .replace('\t', '\\t')
            .replace('\r', '\\r'))
    return text


def add_math_spans_to_line(corrected_text: str, text_line: Line, page: PageGroup):
    SpanClass = get_block_class(BlockTypes.Span)
    corrected_spans = text_to_spans(corrected_text)

    for span_idx, span in enumerate(corrected_spans):
        if span_idx == len(corrected_spans) - 1:
            span['content'] += "\n"

        span_block = page.add_full_block(
            SpanClass(
                polygon=text_line.polygon,
                text=span['content'],
                font='Unknown',
                font_weight=0,
                font_size=0,
                minimum_position=0,
                maximum_position=0,
                formats=[span['type']],
                url=span.get('url'),
                page_id=text_line.page_id,
                text_extraction_method="gemini",
                has_superscript=span["has_superscript"],
                has_subscript=span["has_subscript"]
            )
        )
        text_line.structure.append(span_block.id)


def text_to_spans(text):
    soup = BeautifulSoup(text, 'html.parser')

    tag_types = {
        'b': 'bold',
        'i': 'italic',
        'math': 'math',
        'sub': 'plain',
        'sup': 'plain',
        'span': 'plain'
    }
    spans = []

    for element in soup.descendants:
        if not len(list(element.parents)) == 1:
            continue

        url = element.attrs.get('href') if hasattr(element, 'attrs') else None

        if element.name in tag_types:
            text = element.get_text()
            if element.name == "math":
                text = escape_latex_commands(text)
            spans.append({
                'type': tag_types[element.name],
                'content': text,
                'url': url,
                "has_superscript": element.name == "sup",
                "has_subscript": element.name == "sub"
            })
        elif element.string:
            spans.append({
                'type': 'plain',
                'content': element.string,
                'url': url,
                "has_superscript": False,
                "has_subscript": False
            })

    return spans
```
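`text_to_spans` walks only the top-level nodes of the parsed HTML, mapping known tags to span formats and everything else to plain text. A small self-contained example of the structure it returns:

```python
from marker.processors.util import text_to_spans

spans = text_to_spans("The identity <math>x^2 + y^2 = z^2</math> is shown in <b>bold</b> text.")
for span in spans:
    print(span["type"], repr(span["content"]))
# Expected spans, in document order:
# plain 'The identity '
# math 'x^2 + y^2 = z^2'
# plain ' is shown in '
# bold 'bold'
# plain ' text.'
```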

--------------------------------------------------------------------------------
/marker/providers/__init__.py:
--------------------------------------------------------------------------------

```python
from copy import deepcopy
from typing import List, Optional, Dict

from PIL import Image
from pydantic import BaseModel
from pdftext.schema import Reference

from marker.logger import configure_logging
from marker.schema.polygon import PolygonBox
from marker.schema.text import Span
from marker.schema.text.char import Char
from marker.schema.text.line import Line
from marker.settings import settings
from marker.util import assign_config

configure_logging()


class ProviderOutput(BaseModel):
    line: Line
    spans: List[Span]
    chars: Optional[List[List[Char]]] = None

    @property
    def raw_text(self):
        return "".join(span.text for span in self.spans)

    def __hash__(self):
        return hash(tuple(self.line.polygon.bbox))

    def merge(self, other: "ProviderOutput"):
        new_output = deepcopy(self)
        other_copy = deepcopy(other)
        new_output.spans.extend(other_copy.spans)
        if new_output.chars is not None and other_copy.chars is not None:
            new_output.chars.extend(other_copy.chars)
        elif other_copy.chars is not None:
            new_output.chars = other_copy.chars

        new_output.line.polygon = new_output.line.polygon.merge(
            [other_copy.line.polygon]
        )
        return new_output


ProviderPageLines = Dict[int, List[ProviderOutput]]


class BaseProvider:
    def __init__(self, filepath: str, config: Optional[BaseModel | dict] = None):
        assign_config(self, config)
        self.filepath = filepath

    def __len__(self):
        pass

    def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]:
        pass

    def get_page_bbox(self, idx: int) -> PolygonBox | None:
        pass

    def get_page_lines(self, idx: int) -> List[Line]:
        pass

    def get_page_refs(self, idx: int) -> List[Reference]:
        pass

    def __enter__(self):
        return self

    @staticmethod
    def get_font_css():
        from weasyprint import CSS
        from weasyprint.text.fonts import FontConfiguration

        font_config = FontConfiguration()
        css = CSS(
            string=f"""
            @font-face {{
                font-family: GoNotoCurrent-Regular;
                src: url({settings.FONT_PATH});
                font-display: swap;
            }}
            body {{
                font-family: {settings.FONT_NAME.split(".")[0]}, sans-serif;
                font-variant-ligatures: none;
                font-feature-settings: "liga" 0;
                text-rendering: optimizeLegibility;
            }}
            """,
            font_config=font_config,
        )
        return css
```

--------------------------------------------------------------------------------
/marker/providers/document.py:
--------------------------------------------------------------------------------

```python
import base64
import os
import re
import tempfile
from io import BytesIO

from PIL import Image

from marker.logger import get_logger
from marker.providers.pdf import PdfProvider

logger = get_logger()

css = """
@page {
    size: A4;
    margin: 2cm;
}

img {
    max-width: 100%;
    max-height: 25cm;
    object-fit: contain;
    margin: 12pt auto;
}

div, p {
    max-width: 100%;
    word-break: break-word;
    font-size: 10pt;
}

table {
    width: 100%;
    border-collapse: collapse;
    break-inside: auto;
    font-size: 10pt;
}

tr {
    break-inside: avoid;
    page-break-inside: avoid;
}

td {
    border: 0.75pt solid #000;
    padding: 6pt;
}
"""


class DocumentProvider(PdfProvider):
    def __init__(self, filepath: str, config=None):
        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        self.temp_pdf_path = temp_pdf.name
        temp_pdf.close()

        # Convert DOCX to PDF
        try:
            self.convert_docx_to_pdf(filepath)
        except Exception as e:
            raise RuntimeError(f"Failed to convert {filepath} to PDF: {e}")

        # Initialize the PDF provider with the temp pdf path
        super().__init__(self.temp_pdf_path, config)

    def __del__(self):
        if os.path.exists(self.temp_pdf_path):
            os.remove(self.temp_pdf_path)

    def convert_docx_to_pdf(self, filepath: str):
        from weasyprint import CSS, HTML
        import mammoth

        with open(filepath, "rb") as docx_file:
            # we convert the docx to HTML
            result = mammoth.convert_to_html(docx_file)
            html = result.value

            # We convert the HTML into a PDF
            HTML(string=self._preprocess_base64_images(html)).write_pdf(
                self.temp_pdf_path, stylesheets=[CSS(string=css), self.get_font_css()]
            )

    @staticmethod
    def _preprocess_base64_images(html_content):
        pattern = r'data:([^;]+);base64,([^"\'>\s]+)'

        def convert_image(match):
            try:
                img_data = base64.b64decode(match.group(2))
                with BytesIO(img_data) as bio:
                    with Image.open(bio) as img:
                        output = BytesIO()
                        img.save(output, format=img.format)
                        new_base64 = base64.b64encode(output.getvalue()).decode()
                        return f"data:{match.group(1)};base64,{new_base64}"
            except Exception as e:
                logger.error(f"Failed to process image: {e}")
                return ""  # we ditch broken images as that breaks the PDF creation down the line

        return re.sub(pattern, convert_image, html_content)
```

--------------------------------------------------------------------------------
/benchmarks/overall/methods/olmocr.py:
--------------------------------------------------------------------------------

```python
import base64
import json
import tempfile
import time
from io import BytesIO

import torch
from PIL import Image

from benchmarks.overall.methods import BaseMethod, BenchmarkResult


def convert_single_page(filename: str, model, processor, device):
    from olmocr.data.renderpdf import render_pdf_to_base64png
    from olmocr.prompts import build_finetuning_prompt
    from olmocr.prompts.anchor import get_anchor_text

    image_base64 = render_pdf_to_base64png(filename, 1, target_longest_image_dim=1024)

    # Build the prompt, using document metadata
    anchor_text = get_anchor_text(filename, 1, pdf_engine="pdfreport", target_length=4000)
    prompt = build_finetuning_prompt(anchor_text)

    # Build the full prompt
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ]

    # Apply the chat template and processor
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    main_image = Image.open(BytesIO(base64.b64decode(image_base64)))

    inputs = processor(
        text=[text],
        images=[main_image],
        padding=True,
        return_tensors="pt",
    )
    inputs = {key: value.to(device) for (key, value) in inputs.items()}

    # Generate the output
    output = model.generate(
        **inputs,
        temperature=0.8,
        max_new_tokens=8192,
        num_return_sequences=1,
        do_sample=True,
    )

    # Decode the output
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output[:, prompt_length:]
    text_output = processor.tokenizer.batch_decode(
        new_tokens, skip_special_tokens=True
    )[0]

    try:
        text_output = json.loads(text_output)
        text = text_output["natural_text"]
    except Exception:
        try:
            text = text_output.split("natural_text")[1].strip()
        except Exception:
            text = ""

    return text


class OlmOCRMethod(BaseMethod):
    olmocr_model: dict = None
    use_llm: bool = False

    def __call__(self, sample) -> BenchmarkResult:
        pdf_bytes = sample["pdf"]  # This is a single page PDF

        with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
            f.write(pdf_bytes)
            start = time.time()
            result = convert_single_page(f.name, self.olmocr_model["model"], self.olmocr_model["processor"], self.olmocr_model["model"].device)
            total = time.time() - start

        return {
            "markdown": result,
            "time": total
        }
```
text_lines=["54<i>.45</i>67<br>89<math>x</math>"], rowspan=1, colspan=1, row_id=0, col_id=0, is_header=False, page_id=page.page_id, ) page.add_full_block(cell) table.structure = [] table.add_structure(cell) renderer = MarkdownRenderer() md = renderer(pdf_document).markdown assert "54 <i>.45</i> 67<br>89 $x$" in md ``` -------------------------------------------------------------------------------- /marker/schema/registry.py: -------------------------------------------------------------------------------- ```python from typing import Dict, Type from importlib import import_module from marker.schema import BlockTypes from marker.schema.blocks import ( Block, Caption, Code, Equation, Figure, Footnote, Form, Handwriting, InlineMath, ListItem, PageFooter, PageHeader, Picture, SectionHeader, Table, TableOfContents, Text, ComplexRegion, TableCell, Reference, ) from marker.schema.document import Document from marker.schema.groups import ( FigureGroup, ListGroup, PageGroup, PictureGroup, TableGroup, ) from marker.schema.text import Line, Span from marker.schema.text.char import Char BLOCK_REGISTRY: Dict[BlockTypes, str] = {} def register_block_class(block_type: BlockTypes, block_cls: Type[Block]): BLOCK_REGISTRY[block_type] = f"{block_cls.__module__}.{block_cls.__name__}" def get_block_class(block_type: BlockTypes) -> Type[Block]: class_path = BLOCK_REGISTRY[block_type] module_name, class_name = class_path.rsplit(".", 1) module = import_module(module_name) return getattr(module, class_name) register_block_class(BlockTypes.Line, Line) register_block_class(BlockTypes.Span, Span) register_block_class(BlockTypes.Char, Char) register_block_class(BlockTypes.FigureGroup, FigureGroup) register_block_class(BlockTypes.TableGroup, TableGroup) register_block_class(BlockTypes.ListGroup, ListGroup) register_block_class(BlockTypes.PictureGroup, PictureGroup) register_block_class(BlockTypes.Page, PageGroup) register_block_class(BlockTypes.Caption, Caption) register_block_class(BlockTypes.Code, Code) register_block_class(BlockTypes.Figure, Figure) register_block_class(BlockTypes.Footnote, Footnote) register_block_class(BlockTypes.Form, Form) register_block_class(BlockTypes.Equation, Equation) register_block_class(BlockTypes.Handwriting, Handwriting) register_block_class(BlockTypes.TextInlineMath, InlineMath) register_block_class(BlockTypes.ListItem, ListItem) register_block_class(BlockTypes.PageFooter, PageFooter) register_block_class(BlockTypes.PageHeader, PageHeader) register_block_class(BlockTypes.Picture, Picture) register_block_class(BlockTypes.SectionHeader, SectionHeader) register_block_class(BlockTypes.Table, Table) register_block_class(BlockTypes.Text, Text) register_block_class(BlockTypes.TableOfContents, TableOfContents) register_block_class(BlockTypes.ComplexRegion, ComplexRegion) register_block_class(BlockTypes.TableCell, TableCell) register_block_class(BlockTypes.Reference, Reference) register_block_class(BlockTypes.Document, Document) assert len(BLOCK_REGISTRY) == len(BlockTypes) assert all( [ get_block_class(k).model_fields["block_type"].default == k for k, _ in BLOCK_REGISTRY.items() ] ) ``` -------------------------------------------------------------------------------- /marker/processors/blockquote.py: -------------------------------------------------------------------------------- ```python from typing import Annotated, Tuple from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.document import Document class BlockquoteProcessor(BaseProcessor): """ A 

--------------------------------------------------------------------------------
/marker/processors/blockquote.py:
--------------------------------------------------------------------------------

```python
from typing import Annotated, Tuple

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document


class BlockquoteProcessor(BaseProcessor):
    """
    A processor for tagging blockquotes.
    """
    block_types: Annotated[
        Tuple[BlockTypes],
        "The block types to process.",
    ] = (BlockTypes.Text, BlockTypes.TextInlineMath)
    min_x_indent: Annotated[
        float,
        "The minimum horizontal indentation required to consider a block as part of a blockquote.",
        "Expressed as a percentage of the block width.",
    ] = 0.1
    x_start_tolerance: Annotated[
        float,
        "The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.",
        "Expressed as a percentage of the block width.",
    ] = 0.01
    x_end_tolerance: Annotated[
        float,
        "The maximum allowable difference between the ending x-coordinates of consecutive blocks to consider them aligned.",
        "Expressed as a percentage of the block width.",
    ] = 0.01

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                if block.structure is None:
                    continue

                if not len(block.structure) >= 2:
                    continue

                next_block = page.get_next_block(block)
                if next_block is None:
                    continue
                if next_block.block_type not in self.block_types:
                    continue
                if next_block.structure is None:
                    continue
                if next_block.ignore_for_output:
                    continue

                matching_x_end = abs(next_block.polygon.x_end - block.polygon.x_end) < self.x_end_tolerance * block.polygon.width
                matching_x_start = abs(next_block.polygon.x_start - block.polygon.x_start) < self.x_start_tolerance * block.polygon.width
                x_indent = next_block.polygon.x_start > block.polygon.x_start + (self.min_x_indent * block.polygon.width)
                y_indent = next_block.polygon.y_start > block.polygon.y_end

                if block.blockquote:
                    next_block.blockquote = (matching_x_end and matching_x_start) or (x_indent and y_indent)
                    next_block.blockquote_level = block.blockquote_level
                    if (x_indent and y_indent):
                        next_block.blockquote_level += 1
                elif len(next_block.structure) >= 2 and (x_indent and y_indent):
                    next_block.blockquote = True
                    next_block.blockquote_level = 1
```

--------------------------------------------------------------------------------
/tests/services/test_service_init.py:
--------------------------------------------------------------------------------

```python
import pytest

from marker.converters.pdf import PdfConverter
from marker.services.gemini import GoogleGeminiService
from marker.services.ollama import OllamaService
from marker.services.vertex import GoogleVertexService
from marker.services.openai import OpenAIService
from marker.services.azure_openai import AzureOpenAIService


@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [0]})
def test_empty_llm(pdf_converter: PdfConverter, temp_doc):
    assert pdf_converter.artifact_dict["llm_service"] is None
    assert pdf_converter.llm_service is None


def test_llm_no_keys(model_dict, config):
    with pytest.raises(AssertionError):
        PdfConverter(artifact_dict=model_dict, config={"use_llm": True})


@pytest.mark.output_format("markdown")
@pytest.mark.config({"page_range": [0], "use_llm": True, "gemini_api_key": "test"})
def test_llm_gemini(pdf_converter: PdfConverter, temp_doc):
    assert pdf_converter.artifact_dict["llm_service"] is not None
    assert isinstance(pdf_converter.llm_service, GoogleGeminiService)


@pytest.mark.output_format("markdown")
@pytest.mark.config(
    {
        "page_range": [0],
        "use_llm": True,
        "vertex_project_id": "test",
        "llm_service": "marker.services.vertex.GoogleVertexService",
    }
)
def test_llm_vertex(pdf_converter: PdfConverter, temp_doc):
    assert pdf_converter.artifact_dict["llm_service"] is not None
    assert isinstance(pdf_converter.llm_service, GoogleVertexService)


@pytest.mark.output_format("markdown")
@pytest.mark.config(
    {
        "page_range": [0],
        "use_llm": True,
        "llm_service": "marker.services.ollama.OllamaService",
    }
)
def test_llm_ollama(pdf_converter: PdfConverter, temp_doc):
    assert pdf_converter.artifact_dict["llm_service"] is not None
    assert isinstance(pdf_converter.llm_service, OllamaService)


@pytest.mark.output_format("markdown")
@pytest.mark.config(
    {
        "page_range": [0],
        "use_llm": True,
        "llm_service": "marker.services.openai.OpenAIService",
        "openai_api_key": "test",
    }
)
def test_llm_openai(pdf_converter: PdfConverter, temp_doc):
    assert pdf_converter.artifact_dict["llm_service"] is not None
    assert isinstance(pdf_converter.llm_service, OpenAIService)


@pytest.mark.output_format("markdown")
@pytest.mark.config(
    {
        "page_range": [0],
        "use_llm": True,
        "llm_service": "marker.services.azure_openai.AzureOpenAIService",
        "azure_endpoint": "https://example.openai.azure.com",
        "azure_api_key": "test",
        "deployment_name": "test-model",
        "azure_api_version": "1",
    }
)
def test_llm_azure_openai(pdf_converter: PdfConverter, temp_doc):
    assert pdf_converter.artifact_dict["llm_service"] is not None
    assert isinstance(pdf_converter.llm_service, AzureOpenAIService)
```

--------------------------------------------------------------------------------
/marker/converters/extraction.py:
--------------------------------------------------------------------------------

```python
import re
from typing import Annotated

from marker.builders.document import DocumentBuilder
from marker.builders.line import LineBuilder
from marker.builders.ocr import OcrBuilder
from marker.builders.structure import StructureBuilder
from marker.converters.pdf import PdfConverter
from marker.extractors.document import DocumentExtractor
from marker.extractors.page import PageExtractor
from marker.providers.registry import provider_from_filepath
from marker.renderers.extraction import ExtractionRenderer, ExtractionOutput
from marker.renderers.markdown import MarkdownRenderer

from marker.logger import get_logger

logger = get_logger()


class ExtractionConverter(PdfConverter):
    pattern: str = r"{\d+\}-{48}\n\n"
    existing_markdown: Annotated[
        str, "Markdown that was already converted for extraction."
    ] = None

    def build_document(self, filepath: str):
        provider_cls = provider_from_filepath(filepath)
        layout_builder = self.resolve_dependencies(self.layout_builder_class)
        line_builder = self.resolve_dependencies(LineBuilder)
        ocr_builder = self.resolve_dependencies(OcrBuilder)
        provider = provider_cls(filepath, self.config)
        document = DocumentBuilder(self.config)(
            provider, layout_builder, line_builder, ocr_builder
        )
        structure_builder_cls = self.resolve_dependencies(StructureBuilder)
        structure_builder_cls(document)

        for processor in self.processor_list:
            processor(document)

        return document, provider

    def __call__(self, filepath: str) -> ExtractionOutput:
        self.config["paginate_output"] = True  # Ensure we can split the output properly
        self.config["output_format"] = (
            "markdown"  # Output must be markdown for extraction
        )
        markdown = self.existing_markdown

        if not markdown:
            document, provider = self.build_document(filepath)
            self.page_count = len(document.pages)
            renderer = self.resolve_dependencies(MarkdownRenderer)
            output = renderer(document)
            markdown = output.markdown

        output_pages = re.split(self.pattern, markdown)[1:]  # Split output into pages

        # This needs an LLM service for extraction, this sets it in the extractor
        if self.artifact_dict.get("llm_service") is None:
            self.artifact_dict["llm_service"] = self.resolve_dependencies(
                self.default_llm_service
            )

        page_extractor = self.resolve_dependencies(PageExtractor)
        document_extractor = self.resolve_dependencies(DocumentExtractor)
        renderer = self.resolve_dependencies(ExtractionRenderer)

        # Inference in parallel
        notes = page_extractor(output_pages)
        document_output = document_extractor(notes)
        merged = renderer(document_output, markdown)

        return merged
```
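Putting the pieces together, a hedged end-to-end sketch of structured extraction with `ExtractionConverter`: the file name, schema, and API key are placeholders, and `create_model_dict` is assumed to be the usual model-loading helper from the `marker` package (it is not shown in this excerpt).

```python
import json

from marker.converters.extraction import ExtractionConverter
from marker.models import create_model_dict  # assumed helper, not shown in this excerpt

# Placeholder schema and config; page_schema / output_format mirror the test fixture
# above, and the LLM keys mirror tests/services/test_service_init.py.
schema = {
    "title": "Invoice",
    "type": "object",
    "properties": {"total": {"title": "Total", "type": "string"}},
    "required": ["total"],
}
config = {
    "page_schema": json.dumps(schema),
    "output_format": "markdown",
    "use_llm": True,
    "gemini_api_key": "YOUR_KEY",  # placeholder credential
}

converter = ExtractionConverter(
    artifact_dict=create_model_dict(), processor_list=None, config=config
)
result = converter("invoice.pdf")  # placeholder input file
print(result.document_json)        # JSON string matching the schema
```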