This is page 3 of 9. Use http://codebase.md/datalab-to/marker?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .github
│ ├── ISSUE_TEMPLATE
│ │ ├── breaking-bug-report.md
│ │ ├── feature_request.md
│ │ └── output-bug-report.md
│ └── workflows
│ ├── benchmarks.yml
│ ├── ci.yml
│ ├── cla.yml
│ ├── publish.yml
│ └── scripts.yml
├── .gitignore
├── .pre-commit-config.yaml
├── benchmarks
│ ├── __init__.py
│ ├── overall
│ │ ├── __init__.py
│ │ ├── display
│ │ │ ├── __init__.py
│ │ │ ├── dataset.py
│ │ │ └── table.py
│ │ ├── download
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── llamaparse.py
│ │ │ ├── main.py
│ │ │ ├── mathpix.py
│ │ │ └── mistral.py
│ │ ├── elo.py
│ │ ├── methods
│ │ │ ├── __init__.py
│ │ │ ├── docling.py
│ │ │ ├── gt.py
│ │ │ ├── llamaparse.py
│ │ │ ├── marker.py
│ │ │ ├── mathpix.py
│ │ │ ├── mistral.py
│ │ │ ├── olmocr.py
│ │ │ └── schema.py
│ │ ├── overall.py
│ │ ├── registry.py
│ │ ├── schema.py
│ │ └── scorers
│ │ ├── __init__.py
│ │ ├── clean.py
│ │ ├── heuristic.py
│ │ ├── llm.py
│ │ └── schema.py
│ ├── table
│ │ ├── __init__.py
│ │ ├── gemini.py
│ │ ├── inference.py
│ │ ├── scoring.py
│ │ └── table.py
│ ├── throughput
│ │ ├── __init__.py
│ │ └── main.py
│ └── verify_scores.py
├── chunk_convert.py
├── CLA.md
├── convert_single.py
├── convert.py
├── data
│ ├── .gitignore
│ ├── examples
│ │ ├── json
│ │ │ ├── multicolcnn.json
│ │ │ ├── switch_trans.json
│ │ │ └── thinkpython.json
│ │ └── markdown
│ │ ├── multicolcnn
│ │ │ ├── _page_1_Figure_0.jpeg
│ │ │ ├── _page_2_Picture_0.jpeg
│ │ │ ├── _page_6_Figure_0.jpeg
│ │ │ ├── _page_7_Figure_0.jpeg
│ │ │ ├── multicolcnn_meta.json
│ │ │ └── multicolcnn.md
│ │ ├── switch_transformers
│ │ │ ├── _page_11_Figure_4.jpeg
│ │ │ ├── _page_12_Figure_4.jpeg
│ │ │ ├── _page_13_Figure_2.jpeg
│ │ │ ├── _page_18_Figure_1.jpeg
│ │ │ ├── _page_18_Figure_3.jpeg
│ │ │ ├── _page_2_Figure_3.jpeg
│ │ │ ├── _page_20_Figure_1.jpeg
│ │ │ ├── _page_20_Figure_4.jpeg
│ │ │ ├── _page_27_Figure_1.jpeg
│ │ │ ├── _page_29_Figure_1.jpeg
│ │ │ ├── _page_30_Figure_1.jpeg
│ │ │ ├── _page_31_Figure_3.jpeg
│ │ │ ├── _page_4_Figure_1.jpeg
│ │ │ ├── _page_5_Figure_3.jpeg
│ │ │ ├── switch_trans_meta.json
│ │ │ └── switch_trans.md
│ │ └── thinkpython
│ │ ├── _page_109_Figure_1.jpeg
│ │ ├── _page_115_Figure_1.jpeg
│ │ ├── _page_116_Figure_3.jpeg
│ │ ├── _page_127_Figure_1.jpeg
│ │ ├── _page_128_Figure_1.jpeg
│ │ ├── _page_167_Figure_1.jpeg
│ │ ├── _page_169_Figure_1.jpeg
│ │ ├── _page_173_Figure_1.jpeg
│ │ ├── _page_190_Figure_1.jpeg
│ │ ├── _page_195_Figure_1.jpeg
│ │ ├── _page_205_Figure_1.jpeg
│ │ ├── _page_23_Figure_1.jpeg
│ │ ├── _page_23_Figure_3.jpeg
│ │ ├── _page_230_Figure_1.jpeg
│ │ ├── _page_233_Figure_1.jpeg
│ │ ├── _page_233_Figure_3.jpeg
│ │ ├── _page_234_Figure_1.jpeg
│ │ ├── _page_235_Figure_1.jpeg
│ │ ├── _page_236_Figure_1.jpeg
│ │ ├── _page_236_Figure_3.jpeg
│ │ ├── _page_237_Figure_1.jpeg
│ │ ├── _page_238_Figure_1.jpeg
│ │ ├── _page_46_Figure_1.jpeg
│ │ ├── _page_60_Figure_1.jpeg
│ │ ├── _page_60_Figure_3.jpeg
│ │ ├── _page_67_Figure_1.jpeg
│ │ ├── _page_71_Figure_1.jpeg
│ │ ├── _page_78_Figure_1.jpeg
│ │ ├── _page_85_Figure_1.jpeg
│ │ ├── _page_94_Figure_1.jpeg
│ │ ├── _page_99_Figure_17.jpeg
│ │ ├── _page_99_Figure_178.jpeg
│ │ ├── thinkpython_meta.json
│ │ └── thinkpython.md
│ ├── images
│ │ ├── overall.png
│ │ ├── per_doc.png
│ │ └── table.png
│ └── latex_to_md.sh
├── examples
│ ├── marker_modal_deployment.py
│ └── README.md
├── extraction_app.py
├── LICENSE
├── marker
│ ├── builders
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── layout.py
│ │ ├── line.py
│ │ ├── ocr.py
│ │ └── structure.py
│ ├── config
│ │ ├── __init__.py
│ │ ├── crawler.py
│ │ ├── parser.py
│ │ └── printer.py
│ ├── converters
│ │ ├── __init__.py
│ │ ├── extraction.py
│ │ ├── ocr.py
│ │ ├── pdf.py
│ │ └── table.py
│ ├── extractors
│ │ ├── __init__.py
│ │ ├── document.py
│ │ └── page.py
│ ├── logger.py
│ ├── models.py
│ ├── output.py
│ ├── processors
│ │ ├── __init__.py
│ │ ├── blank_page.py
│ │ ├── block_relabel.py
│ │ ├── blockquote.py
│ │ ├── code.py
│ │ ├── debug.py
│ │ ├── document_toc.py
│ │ ├── equation.py
│ │ ├── footnote.py
│ │ ├── ignoretext.py
│ │ ├── line_merge.py
│ │ ├── line_numbers.py
│ │ ├── list.py
│ │ ├── llm
│ │ │ ├── __init__.py
│ │ │ ├── llm_complex.py
│ │ │ ├── llm_equation.py
│ │ │ ├── llm_form.py
│ │ │ ├── llm_handwriting.py
│ │ │ ├── llm_image_description.py
│ │ │ ├── llm_mathblock.py
│ │ │ ├── llm_meta.py
│ │ │ ├── llm_page_correction.py
│ │ │ ├── llm_sectionheader.py
│ │ │ ├── llm_table_merge.py
│ │ │ └── llm_table.py
│ │ ├── order.py
│ │ ├── page_header.py
│ │ ├── reference.py
│ │ ├── sectionheader.py
│ │ ├── table.py
│ │ ├── text.py
│ │ └── util.py
│ ├── providers
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── epub.py
│ │ ├── html.py
│ │ ├── image.py
│ │ ├── pdf.py
│ │ ├── powerpoint.py
│ │ ├── registry.py
│ │ ├── spreadsheet.py
│ │ └── utils.py
│ ├── renderers
│ │ ├── __init__.py
│ │ ├── chunk.py
│ │ ├── extraction.py
│ │ ├── html.py
│ │ ├── json.py
│ │ ├── markdown.py
│ │ └── ocr_json.py
│ ├── schema
│ │ ├── __init__.py
│ │ ├── blocks
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── basetable.py
│ │ │ ├── caption.py
│ │ │ ├── code.py
│ │ │ ├── complexregion.py
│ │ │ ├── equation.py
│ │ │ ├── figure.py
│ │ │ ├── footnote.py
│ │ │ ├── form.py
│ │ │ ├── handwriting.py
│ │ │ ├── inlinemath.py
│ │ │ ├── listitem.py
│ │ │ ├── pagefooter.py
│ │ │ ├── pageheader.py
│ │ │ ├── picture.py
│ │ │ ├── reference.py
│ │ │ ├── sectionheader.py
│ │ │ ├── table.py
│ │ │ ├── tablecell.py
│ │ │ ├── text.py
│ │ │ └── toc.py
│ │ ├── document.py
│ │ ├── groups
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── figure.py
│ │ │ ├── list.py
│ │ │ ├── page.py
│ │ │ ├── picture.py
│ │ │ └── table.py
│ │ ├── polygon.py
│ │ ├── registry.py
│ │ └── text
│ │ ├── __init__.py
│ │ ├── char.py
│ │ ├── line.py
│ │ └── span.py
│ ├── scripts
│ │ ├── __init__.py
│ │ ├── chunk_convert.py
│ │ ├── chunk_convert.sh
│ │ ├── common.py
│ │ ├── convert_single.py
│ │ ├── convert.py
│ │ ├── extraction_app.py
│ │ ├── file_to_s3.py
│ │ ├── run_streamlit_app.py
│ │ ├── server.py
│ │ └── streamlit_app.py
│ ├── services
│ │ ├── __init__.py
│ │ ├── azure_openai.py
│ │ ├── claude.py
│ │ ├── gemini.py
│ │ ├── ollama.py
│ │ ├── openai.py
│ │ └── vertex.py
│ ├── settings.py
│ ├── util.py
│ └── utils
│ ├── __init__.py
│ ├── batch.py
│ ├── gpu.py
│ └── image.py
├── marker_app.py
├── marker_server.py
├── MODEL_LICENSE
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── README.md
├── signatures
│ └── version1
│ └── cla.json
├── static
│ └── fonts
│ └── .gitignore
└── tests
├── builders
│ ├── test_blank_page.py
│ ├── test_document_builder.py
│ ├── test_garbled_pdf.py
│ ├── test_layout_replace.py
│ ├── test_ocr_builder.py
│ ├── test_ocr_pipeline.py
│ ├── test_overriding.py
│ ├── test_pdf_links.py
│ ├── test_rotated_bboxes.py
│ ├── test_strip_existing_ocr.py
│ └── test_structure.py
├── config
│ └── test_config.py
├── conftest.py
├── converters
│ ├── test_extraction_converter.py
│ ├── test_ocr_converter.py
│ ├── test_pdf_converter.py
│ └── test_table_converter.py
├── processors
│ ├── test_document_toc_processor.py
│ ├── test_equation_processor.py
│ ├── test_footnote_processor.py
│ ├── test_ignoretext.py
│ ├── test_llm_processors.py
│ ├── test_table_merge.py
│ └── test_table_processor.py
├── providers
│ ├── test_document_providers.py
│ ├── test_image_provider.py
│ └── test_pdf_provider.py
├── renderers
│ ├── test_chunk_renderer.py
│ ├── test_extract_images.py
│ ├── test_html_renderer.py
│ ├── test_json_renderer.py
│ └── test_markdown_renderer.py
├── schema
│ └── groups
│ └── test_list_grouping.py
├── services
│ └── test_service_init.py
└── utils.py
```
# Files
--------------------------------------------------------------------------------
/marker/services/claude.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import time
3 | from typing import List, Annotated, T
4 |
5 | import PIL
6 | from PIL import Image
7 | import anthropic
8 | from anthropic import RateLimitError, APITimeoutError
9 | from marker.logger import get_logger
10 | from pydantic import BaseModel
11 |
12 | from marker.schema.blocks import Block
13 | from marker.services import BaseService
14 |
15 | logger = get_logger()
16 |
17 |
class ClaudeService(BaseService):
    """LLM service implementation backed by Anthropic's Claude messages API."""

    claude_model_name: Annotated[
        str, "The name of the Claude model to use for the service."
    ] = "claude-3-7-sonnet-20250219"
    claude_api_key: Annotated[str, "The Claude API key to use for the service."] = None
    max_claude_tokens: Annotated[
        int, "The maximum number of tokens to use for a single Claude request."
    ] = 8192

    def process_images(self, images: List[Image.Image]) -> List[dict]:
        """Convert PIL images into Claude API image content blocks (base64 webp)."""
        return [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/webp",
                    "data": self.img_to_base64(img),
                },
            }
            for img in images
        ]

    def validate_response(self, response_text: str, schema: type[BaseModel]) -> dict | None:
        """Parse Claude's text response against ``schema``.

        Strips markdown code fences if present, then validates the JSON.
        Returns the validated payload as a plain dict, or ``None`` when the
        response cannot be parsed even after escape fixing.
        (Fixed: was annotated ``-> T`` although it returns a dict, and used an
        implicit bare ``return`` on failure.)
        """
        response_text = response_text.strip()
        # The model sometimes wraps output in a markdown fence despite instructions.
        if response_text.startswith("```json"):
            response_text = response_text[7:]
        if response_text.endswith("```"):
            response_text = response_text[:-3]

        try:
            # Try to parse as JSON first
            return schema.model_validate_json(response_text).model_dump()
        except Exception:
            try:
                # Re-parse with fixed escapes: single backslashes (invalid JSON
                # escapes) are doubled before re-validating.
                escaped_str = response_text.replace("\\", "\\\\")
                return schema.model_validate_json(escaped_str).model_dump()
            except Exception:
                return None

    def get_client(self):
        """Build an Anthropic client authenticated with the configured API key."""
        return anthropic.Anthropic(
            api_key=self.claude_api_key,
        )

    def __call__(
        self,
        prompt: str,
        image: PIL.Image.Image | List[PIL.Image.Image] | None,
        block: Block | None,
        response_schema: type[BaseModel],
        max_retries: int | None = None,
        timeout: int | None = None,
    ):
        """Send ``prompt`` (plus optional images) to Claude and return the
        schema-validated response as a dict.

        Retries with linear backoff on rate-limit/timeout errors; returns an
        empty dict when all attempts fail or an unexpected error occurs.
        ``block`` is accepted for interface compatibility but not used here.
        """
        if max_retries is None:
            max_retries = self.max_retries

        if timeout is None:
            timeout = self.timeout

        schema_example = response_schema.model_json_schema()
        system_prompt = f"""
Follow the instructions given by the user prompt. You must provide your response in JSON format matching this schema:

{json.dumps(schema_example, indent=2)}

Respond only with the JSON schema, nothing else. Do not include ```json, ```, or any other formatting.
""".strip()

        client = self.get_client()
        image_data = self.format_image_for_llm(image)

        messages = [
            {
                "role": "user",
                "content": [
                    *image_data,
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        total_tries = max_retries + 1
        for tries in range(1, total_tries + 1):
            try:
                response = client.messages.create(
                    system=system_prompt,
                    model=self.claude_model_name,
                    max_tokens=self.max_claude_tokens,
                    messages=messages,
                    timeout=timeout,
                )
                # Extract and validate response
                response_text = response.content[0].text
                return self.validate_response(response_text, response_schema)
            except (RateLimitError, APITimeoutError) as e:
                # Rate limit exceeded
                if tries == total_tries:
                    # Last attempt failed. Give up
                    logger.error(
                        f"Rate limit error: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
                    )
                    break
                else:
                    # Linear backoff: wait longer after each failed attempt.
                    wait_time = tries * self.retry_wait_time
                    logger.warning(
                        f"Rate limit error: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})",
                    )
                    time.sleep(wait_time)
            except Exception as e:
                logger.error(f"Error during Claude API call: {e}")
                break

        return {}
135 |
```
--------------------------------------------------------------------------------
/CLA.md:
--------------------------------------------------------------------------------
```markdown
1 | Marker Contributor Agreement
2 |
3 | This Marker Contributor Agreement ("MCA") applies to any contribution that you make to any product or project managed by us (the "project"), and sets out the intellectual property rights you grant to us in the contributed materials. The term "us" shall mean Endless Labs, Inc. The term "you" shall mean the person or entity identified below.
4 |
5 | If you agree to be bound by these terms, sign by writing "I have read the CLA document and I hereby sign the CLA" in response to the CLA bot Github comment. Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.
6 |
7 | 1. The term 'contribution' or 'contributed materials' means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.
8 | 2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:
9 | - you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements, including dual-license structures for commercial customers;
10 | - you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
11 | - you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
12 | - you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
13 | - you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.
14 | 3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:
15 | - make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
16 | - at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.
17 | If you or your affiliates institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the contribution or any project it was submitted to constitutes direct or contributory patent infringement, then any patent licenses granted to you under this agreement for that contribution shall terminate as of the date such litigation is filed.
18 | 4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms. Any contribution we make available under any license will also be made available under a suitable FSF (Free Software Foundation) or OSI (Open Source Initiative) approved license.
19 | 5. You covenant, represent, warrant and agree that:
20 | - each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this MCA;
21 | - to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
22 | - each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws.
23 | You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. Endless Labs, Inc. may publicly disclose your participation in the project, including the fact that you have signed the MCA.
24 | 6. This MCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.
```
--------------------------------------------------------------------------------
/marker/schema/text/line.py:
--------------------------------------------------------------------------------
```python
1 | import html
2 | import re
3 | from typing import Literal, List
4 |
5 | import regex
6 |
7 | from marker.schema import BlockTypes
8 | from marker.schema.blocks import Block, BlockOutput
9 |
# Character class of trailing-hyphen marks used when de-hyphenating wrapped
# lines (see strip_trailing_hyphens): hyphen, em-dash, and "¬" — presumably
# emitted by some extraction/OCR output as a hyphenation mark; TODO confirm.
HYPHENS = r"-—¬"
11 |
12 |
# Matches a single HTML/XML tag such as "<b>" or "</a>".
_TAG_PATTERN = re.compile(r"<[^>]+>")


def remove_tags(text):
    """Return *text* with every HTML/XML tag removed, keeping only text content."""
    return _TAG_PATTERN.sub("", text)
15 |
16 |
def replace_last(string, old, new):
    """Replace the final regex match of *old* in *string* with *new*.

    *old* is treated as a regular expression. If it never matches, *string*
    is returned unchanged.
    """
    last = None
    for candidate in re.finditer(old, string):
        last = candidate
    if last is None:
        return string
    head, tail = string[: last.start()], string[last.end() :]
    return head + new + tail
23 |
24 |
def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str:
    """De-hyphenate a wrapped line.

    If *line_text* ends in a hyphen mark and *next_line_text* begins with a
    lowercase letter (the usual signature of a word broken across lines), the
    final hyphen is removed from *line_html*; otherwise *line_html* is
    returned unchanged.
    """
    lowercase_letters = r"\p{Ll}"

    ends_with_hyphen = regex.compile(rf".*[{HYPHENS}]\s?$", regex.DOTALL).match(
        line_text
    )
    continues_lowercase = regex.match(
        rf"^\s?[{lowercase_letters}]", next_line_text
    )

    if ends_with_hyphen and continues_lowercase:
        return replace_last(line_html, rf"[{HYPHENS}]", "")
    return line_html
37 |
38 |
class Line(Block):
    """A single line of text, assembled from its child spans."""

    block_type: BlockTypes = BlockTypes.Line
    block_description: str = "A line of text."
    # Sometimes we want to set math format at the line level, not span
    formats: List[Literal["math"]] | None = None

    def ocr_input_text(self, document):
        """Text used as OCR input: span text with only <i>/<b> markers.

        Superscripts/subscripts and math are deliberately not tagged here
        since they can be unreliable at this stage.
        """
        pieces = []
        for span in self.contained_blocks(document, (BlockTypes.Span,)):
            span_text = span.text
            if span.italic:
                pieces.append(f"<i>{span_text}</i>")
            elif span.bold:
                pieces.append(f"<b>{span_text}</b>")
            else:
                pieces.append(span_text)
        return "".join(pieces).strip()

    def formatted_text(self, document, skip_urls=False):
        """HTML-escaped line text with superscript, link, and style tags applied."""
        pieces = []
        for span in self.contained_blocks(document, (BlockTypes.Span,)):
            span_text = html.escape(span.text)

            if span.has_superscript:
                # Wrap the leading digits/symbols in <sup>; if the pattern did
                # not produce one, superscript the whole span instead.
                span_text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", span_text)
                if "<sup>" not in span_text:
                    span_text = f"<sup>{span_text}</sup>"

            if span.url and not skip_urls:
                span_text = f"<a href='{span.url}'>{span_text}</a>"

            if span.italic:
                span_text = f"<i>{span_text}</i>"
            elif span.bold:
                span_text = f"<b>{span_text}</b>"
            elif span.math:
                span_text = f"<math display='inline'>{span_text}</math>"
            pieces.append(span_text)
        return "".join(pieces)

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        """Join child HTML; de-hyphenate against the following line, or strip
        trailing spaces when this is the last line in the parent."""
        template = "".join(child.html for child in child_blocks)

        raw_text = remove_tags(template).strip()
        own_idx = parent_structure.index(self.id)
        if own_idx >= len(parent_structure) - 1:
            # strip any trailing whitespace from the last line
            return template.strip(" ")

        following_line = document.get_block(parent_structure[own_idx + 1])
        return strip_trailing_hyphens(
            raw_text, following_line.raw_text(document), template
        )

    def render(
        self, document, parent_structure, section_hierarchy=None, block_config=None
    ):
        """Render this line and its children into a BlockOutput."""
        child_content = [
            document.get_block(child_id).render(
                document, parent_structure, section_hierarchy, block_config
            )
            for child_id in (self.structure or [])
        ]

        return BlockOutput(
            html=self.assemble_html(
                document, child_content, parent_structure, block_config
            ),
            polygon=self.polygon,
            id=self.id,
            children=[],
            section_hierarchy=section_hierarchy,
        )

    def merge(self, other: "Line"):
        """Absorb *other* into this line: union the polygons, concatenate the
        structures, and union the formats (both fields tolerate None)."""
        self.polygon = self.polygon.merge([other.polygon])

        # Handle merging structure with Nones
        if self.structure is None:
            self.structure = other.structure
        elif other.structure is not None:
            self.structure = self.structure + other.structure

        # Merge formats with Nones
        if self.formats is None:
            self.formats = other.formats
        elif other.formats is not None:
            self.formats = list(set(self.formats + other.formats))
139 |
```
--------------------------------------------------------------------------------
/marker/processors/list.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated, List, Tuple
2 |
3 | from marker.processors import BaseProcessor
4 | from marker.schema import BlockTypes
5 | from marker.schema.blocks import ListItem
6 | from marker.schema.document import Document
7 |
8 |
class ListProcessor(BaseProcessor):
    """
    A processor for merging lists across pages and columns.

    Runs two passes over each page's ListGroup blocks: one to flag groups
    that continue into the next column/page, and one to rebuild list
    nesting from the horizontal indentation of the list items.
    """
    block_types = (BlockTypes.ListGroup,)
    ignored_block_types: Annotated[
        Tuple[BlockTypes],
        "The list of block types to ignore when merging lists.",
    ] = (BlockTypes.PageHeader, BlockTypes.PageFooter)
    min_x_indent: Annotated[
        float, "The minimum horizontal indentation required to consider a block as a nested list item.",
        "This is expressed as a percentage of the page width and is used to determine hierarchical relationships within a list.",
    ] = 0.01

    def __init__(self, config):
        # No extra state: delegates straight to BaseProcessor.
        super().__init__(config)

    def __call__(self, document: Document):
        # Continuation flags first, then indentation-based nesting.
        self.list_group_continuation(document)
        self.list_group_indentation(document)

    def list_group_continuation(self, document: Document):
        """Set ``has_continuation`` on each list group whose following list
        group looks like a continuation across a column or page break."""
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                next_block = document.get_next_block(block, self.ignored_block_types)
                if next_block is None:  # nothing follows this group
                    continue
                if next_block.block_type not in self.block_types:  # not a list group
                    continue
                if next_block.structure is None:  # empty list group
                    continue
                if next_block.ignore_for_output:
                    continue

                column_break, page_break = False, False
                next_block_in_first_quadrant = False

                if next_block.page_id == block.page_id:  # block on the same page
                    # we check for a column break: the next group starts at or
                    # above this group's bottom edge, so it must sit in a new column
                    column_break = next_block.polygon.y_start <= block.polygon.y_end
                else:
                    page_break = True
                    next_page = document.get_page(next_block.page_id)
                    # A continuation should resume in the top-left quadrant of
                    # the next page.
                    next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
                        (next_block.polygon.y_start < next_page.polygon.height // 2)

                block.has_continuation = column_break or (page_break and next_block_in_first_quadrant)

    def list_group_indentation(self, document: Document):
        """Infer ``list_indent_level`` for each list item from its x-offset,
        then re-parent items under the nearest shallower-indented item.

        Pass 1 walks the items keeping a stack of candidate ancestors by
        x-position; pass 2 uses the computed levels to nest items.
        """
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                if block.structure is None:
                    continue
                if block.ignore_for_output:
                    continue

                # NOTE(review): the stack is seeded with the block following
                # `block` — presumably the first list item; confirm
                # get_next_block's semantics when passed None.
                stack: List[ListItem] = [block.get_next_block(page, None)]
                for list_item_id in block.structure:
                    list_item_block: ListItem = page.get_block(list_item_id)

                    # This can be a line sometimes
                    if list_item_block.block_type != BlockTypes.ListItem:
                        continue

                    # Drop ancestors that are not left of this item by at least
                    # min_x_indent of the page width.
                    while stack and list_item_block.polygon.x_start <= stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width):
                        stack.pop()

                    if stack and list_item_block.polygon.y_start > stack[-1].polygon.y_start:
                        # Inherit the level of the nearest ancestor; indent one
                        # more if this item starts further right than it.
                        list_item_block.list_indent_level = stack[-1].list_indent_level
                        if list_item_block.polygon.x_start > stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width):
                            list_item_block.list_indent_level += 1

                    next_list_item_block = block.get_next_block(page, list_item_block)
                    if next_list_item_block is not None and next_list_item_block.polygon.x_start > list_item_block.polygon.x_end:
                        stack = [next_list_item_block]  # reset stack on column breaks
                    else:
                        stack.append(list_item_block)

                # Pass 2: move each item under the closest preceding item with a
                # strictly smaller indent level, merging polygons as we go.
                stack: List[ListItem] = [block.get_next_block(page, None)]
                for list_item_id in block.structure.copy():
                    list_item_block: ListItem = page.get_block(list_item_id)

                    while stack and list_item_block.list_indent_level <= stack[-1].list_indent_level:
                        stack.pop()

                    if stack:
                        current_parent = stack[-1]
                        current_parent.add_structure(list_item_block)
                        current_parent.polygon = current_parent.polygon.merge([list_item_block.polygon])

                        # The item now lives under its parent, not the group.
                        block.remove_structure_items([list_item_id])
                    stack.append(list_item_block)
101 |
```
--------------------------------------------------------------------------------
/marker/config/crawler.py:
--------------------------------------------------------------------------------
```python
1 | import importlib
2 | import inspect
3 | import pkgutil
4 | from functools import cached_property
5 | from typing import Annotated, Dict, Set, Type, get_args, get_origin
6 |
7 | from marker.builders import BaseBuilder
8 | from marker.converters import BaseConverter
9 | from marker.extractors import BaseExtractor
10 | from marker.processors import BaseProcessor
11 | from marker.providers import BaseProvider
12 | from marker.renderers import BaseRenderer
13 | from marker.services import BaseService
14 |
15 |
class ConfigCrawler:
    """Discovers marker's configurable classes and their annotated settings.

    Walks the package of each base class, imports every submodule, collects
    all subclasses, and records each annotated class attribute (raw type,
    formatted type, default value, and description metadata) into
    ``class_config_map``, keyed by category ("Builder", "Processor", ...)
    and then by class name.
    """

    def __init__(
        self,
        base_classes=(
            BaseBuilder,
            BaseProcessor,
            BaseConverter,
            BaseProvider,
            BaseRenderer,
            BaseService,
            BaseExtractor,
        ),
    ):
        self.base_classes = base_classes
        self.class_config_map: Dict[str, dict] = {}

        self._crawl_config()

    def _crawl_config(self):
        """Populate ``class_config_map`` for every subclass of every base class."""
        for base in self.base_classes:
            category = base.__name__.removeprefix("Base")
            category_map = self.class_config_map.setdefault(category, {})

            for class_name, class_type in self._find_subclasses(base).items():
                # Intermediate "Base*" classes are abstract plumbing; skip them.
                if class_name.startswith("Base"):
                    continue

                entry = category_map.setdefault(
                    class_name, {"class_type": class_type, "config": {}}
                )
                for attr, attr_type in self._gather_super_annotations(
                    class_type
                ).items():
                    default = getattr(class_type, attr)
                    metadata = (f"Default is {default}.",)

                    if get_origin(attr_type) is Annotated:
                        # Prefer the Annotated descriptions; only append the
                        # auto-generated default note when none of them
                        # already mention a default.
                        if any("Default" in desc for desc in attr_type.__metadata__):
                            metadata = attr_type.__metadata__
                        else:
                            metadata = attr_type.__metadata__ + metadata
                        attr_type = get_args(attr_type)[0]

                    entry["config"][attr] = (
                        attr_type,
                        self._format_type(attr_type),
                        default,
                        metadata,
                    )

    @staticmethod
    def _gather_super_annotations(cls: Type) -> Dict[str, Type]:
        """
        Collect all annotated attributes from `cls` and its superclasses, bottom-up.
        Subclass attributes overwrite superclass attributes with the same name.
        """
        annotations: Dict[str, Type] = {}
        # Walk the MRO base -> derived so subclass annotations overwrite
        # same-named superclass ones.
        for klass in reversed(cls.__mro__):
            if klass is object:
                continue
            if hasattr(klass, "__annotations__"):
                annotations.update(klass.__annotations__)
        return annotations

    @cached_property
    def attr_counts(self) -> Dict[str, int]:
        """How many discovered classes declare each attribute name."""
        counts: Dict[str, int] = {}
        for category_map in self.class_config_map.values():
            for entry in category_map.values():
                for attr in entry["config"]:
                    counts[attr] = counts.get(attr, 0) + 1
        return counts

    @cached_property
    def attr_set(self) -> Set[str]:
        """Every accepted config key: bare attribute names plus
        ``ClassName_attr``-style per-class overrides."""
        names: Set[str] = set()
        for category_map in self.class_config_map.values():
            for class_name, entry in category_map.items():
                for attr in entry["config"]:
                    names.add(attr)
                    names.add(f"{class_name}_{attr}")
        return names

    def _find_subclasses(self, base_class):
        """Import every module under ``base_class``'s package and collect all
        strict subclasses of it, keyed by class name."""
        subclasses = {}
        package_name = base_class.__module__
        package = importlib.import_module(package_name)
        if hasattr(package, "__path__"):
            for _, submodule_name, _ in pkgutil.walk_packages(
                package.__path__, package_name + "."
            ):
                try:
                    module = importlib.import_module(submodule_name)
                    for name, obj in inspect.getmembers(module, inspect.isclass):
                        if issubclass(obj, base_class) and obj is not base_class:
                            subclasses[name] = obj
                except ImportError:
                    # Modules with missing optional dependencies are skipped.
                    pass
        return subclasses

    def _format_type(self, t: Type) -> str:
        """Format a typing type like Optional[int] into a readable string."""
        if get_origin(t):  # Handle Optional and types with origins separately
            return f"{t}".removeprefix("typing.")
        # Regular types like int, str
        return t.__name__
123 |
124 |
# Module-level singleton; constructing it imports and inspects every module
# under the marker base packages, so this runs at import time.
crawler = ConfigCrawler()
126 |
```
--------------------------------------------------------------------------------
/marker/processors/line_merge.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated, List
2 |
3 | from marker.processors import BaseProcessor
4 | from marker.schema import BlockTypes
5 | from marker.schema.blocks import Block
6 | from marker.schema.document import Document
7 | from marker.schema.text import Line
8 | from marker.util import matrix_intersection_area
9 |
10 |
11 | class LineMergeProcessor(BaseProcessor):
12 | """
13 | A processor for merging inline math lines.
14 | """
15 | block_types = (BlockTypes.Text, BlockTypes.TextInlineMath, BlockTypes.Caption, BlockTypes.Footnote, BlockTypes.SectionHeader)
16 | min_merge_pct: Annotated[
17 | float,
18 | "The minimum percentage of intersection area to consider merging."
19 | ] = .015
20 | block_expand_threshold: Annotated[
21 | float,
22 | "The percentage of the block width to expand the bounding box."
23 | ] = .05
24 | min_merge_ydist: Annotated[
25 | float,
26 | "The minimum y distance between lines to consider merging."
27 | ] = 5
28 | intersection_pct_threshold: Annotated[
29 | float,
30 | "The total amount of intersection area concentrated in the max intersection block."
31 | ] = .5
32 | vertical_overlap_pct_threshold: Annotated[
33 | float,
34 | "The minimum percentage of vertical overlap to consider merging."
35 | ] = .8
36 | use_llm: Annotated[
37 | bool,
38 | "Whether to use LLMs to improve accuracy."
39 | ] = False
40 |
    def __init__(self, config):
        # No extra state: delegates straight to BaseProcessor.
        super().__init__(config)
43 |
44 | def merge_lines(self, lines: List[Line], block: Block):
45 | lines = [l for l in lines if l.polygon.width * 5 > l.polygon.height] # Skip vertical lines
46 | line_bboxes = [l.polygon.expand(self.block_expand_threshold, 0).bbox for l in lines] # Expand horizontally
47 | intersections = matrix_intersection_area(line_bboxes, line_bboxes)
48 |
49 | merges = []
50 | merge = []
51 | for i in range(len(line_bboxes)):
52 | intersection_row = intersections[i]
53 | intersection_row[i] = 0 # Zero out the current idx
54 |
55 | if i < len(line_bboxes) - 1:
56 | intersection_row[i+1] = 0 # Zero out the next idx, so we only evaluate merge from the left
57 |
58 | if len(merge) == 0:
59 | merge.append(i)
60 | continue
61 |
62 | # Zero out previous merge segments
63 | merge_intersection = sum([intersection_row[m] for m in merge])
64 | line_area = lines[i].polygon.area
65 | intersection_pct = merge_intersection / max(1, line_area)
66 |
67 | total_intersection = max(1, sum(intersection_row))
68 |
69 | line_start = lines[merge[0]].polygon.y_start
70 | line_end = lines[merge[0]].polygon.y_end
71 |
72 | vertical_overlap_start = max(line_start, lines[i].polygon.y_start)
73 | vertical_overlap_end = min(line_end, lines[i].polygon.y_end)
74 | vertical_overlap = max(0, vertical_overlap_end - vertical_overlap_start)
75 | vertical_overlap_pct = vertical_overlap / max(1, lines[i].polygon.height)
76 |
77 | if all([
78 | # Overlaps enough
79 | intersection_pct >= self.min_merge_pct,
80 | # Within same line
81 | vertical_overlap_pct > self.vertical_overlap_pct_threshold,
82 | # doesn't overlap with anything else
83 | merge_intersection / total_intersection > self.intersection_pct_threshold
84 | ]):
85 | merge.append(i)
86 | else:
87 | merges.append(merge)
88 | merge = []
89 |
90 | if merge:
91 | merges.append(merge)
92 |
93 | merges = [m for m in merges if len(m) > 1]
94 | merged = set()
95 | for merge in merges:
96 | merge = [m for m in merge if m not in merged]
97 | if len(merge) < 2:
98 | continue
99 |
100 | line: Line = lines[merge[0]]
101 | merged.add(merge[0])
102 | for idx in merge[1:]:
103 | other_line: Line = lines[idx]
104 | line.merge(other_line)
105 | block.structure.remove(other_line.id)
106 | other_line.removed = True # Mark line as removed
107 | merged.add(idx)
108 |
109 | # It is probably math if we are merging provider lines like this
110 | if not line.formats:
111 | line.formats = ["math"]
112 | elif "math" not in line.formats:
113 | line.formats.append("math")
114 |
115 |
116 | def __call__(self, document: Document):
117 | # Merging lines only needed for inline math
118 | if not self.use_llm:
119 | return
120 |
121 | for page in document.pages:
122 | for block in page.contained_blocks(document, self.block_types):
123 | if block.structure is None:
124 | continue
125 |
126 | if not len(block.structure) >= 2: # Skip single lines
127 | continue
128 |
129 | lines = block.contained_blocks(document, (BlockTypes.Line,))
130 | self.merge_lines(lines, block)
131 |
```
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
```python
1 | import tempfile
2 | from typing import Dict, Type
3 |
4 | from PIL import Image, ImageDraw
5 |
6 | import datasets
7 | import pytest
8 |
9 | from marker.builders.document import DocumentBuilder
10 | from marker.builders.layout import LayoutBuilder
11 | from marker.builders.line import LineBuilder
12 | from marker.builders.ocr import OcrBuilder
13 | from marker.builders.structure import StructureBuilder
14 | from marker.converters.pdf import PdfConverter
15 | from marker.models import create_model_dict
16 | from marker.providers.registry import provider_from_filepath
17 | from marker.renderers.chunk import ChunkRenderer
18 | from marker.renderers.html import HTMLRenderer
19 | from marker.schema import BlockTypes
20 | from marker.schema.blocks import Block
21 | from marker.renderers.markdown import MarkdownRenderer
22 | from marker.renderers.json import JSONRenderer
23 | from marker.schema.registry import register_block_class
24 | from marker.util import classes_to_strings, strings_to_classes
25 |
26 |
@pytest.fixture(scope="session")
def model_dict():
    """Session-wide dict of loaded marker models, released at session end."""
    models = create_model_dict()
    yield models
    del models
32 |
33 |
@pytest.fixture(scope="session")
def layout_model(model_dict):
    """Layout model pulled from the shared session model dict."""
    model = model_dict["layout_model"]
    yield model
37 |
38 |
@pytest.fixture(scope="session")
def detection_model(model_dict):
    """Detection model pulled from the shared session model dict."""
    model = model_dict["detection_model"]
    yield model
42 |
43 |
@pytest.fixture(scope="session")
def recognition_model(model_dict):
    """Recognition model pulled from the shared session model dict."""
    model = model_dict["recognition_model"]
    yield model
47 |
48 |
@pytest.fixture(scope="session")
def table_rec_model(model_dict):
    """Table-recognition model pulled from the shared session model dict."""
    model = model_dict["table_rec_model"]
    yield model
52 |
53 |
@pytest.fixture(scope="session")
def ocr_error_model(model_dict):
    """OCR-error model pulled from the shared session model dict."""
    model = model_dict["ocr_error_model"]
    yield model
57 |
58 |
@pytest.fixture(scope="function")
def config(request):
    """Per-test config dict taken from the @pytest.mark.config marker.

    Any "override_map" entry is registered into the block-class registry as a
    side effect before the config is returned.
    """
    marker = request.node.get_closest_marker("config")
    cfg = {} if marker is None else marker.args[0]

    overrides: Dict[BlockTypes, Type[Block]] = cfg.get("override_map", {})
    for blk_type, blk_cls in overrides.items():
        register_block_class(blk_type, blk_cls)

    return cfg
69 |
70 |
@pytest.fixture(scope="session")
def pdf_dataset():
    """Load the shared PDF test dataset once per session."""
    dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
    return dataset
74 |
75 |
@pytest.fixture(scope="function")
def temp_doc(request, pdf_dataset):
    """Write the requested dataset document to a named temporary file.

    The document is chosen with @pytest.mark.filename("name.ext") and
    defaults to "adversarial.pdf". Fix: the NamedTemporaryFile is now closed
    on teardown (which also deletes it), instead of lingering open until
    garbage collection.
    """
    filename_mark = request.node.get_closest_marker("filename")
    filename = filename_mark.args[0] if filename_mark else "adversarial.pdf"

    idx = pdf_dataset["filename"].index(filename)
    suffix = filename.split(".")[-1]

    temp_pdf = tempfile.NamedTemporaryFile(suffix=f".{suffix}")
    try:
        temp_pdf.write(pdf_dataset["pdf"][idx])
        temp_pdf.flush()
        yield temp_pdf
    finally:
        # Closing deletes the file (NamedTemporaryFile default delete=True).
        temp_pdf.close()
88 |
89 |
@pytest.fixture(scope="function")
def doc_provider(request, config, temp_doc):
    """Provider instance matching the temp document's file type."""
    cls = provider_from_filepath(temp_doc.name)
    provider = cls(temp_doc.name, config)
    yield provider
94 |
95 |
@pytest.fixture(scope="function")
def pdf_document(
    request,
    config,
    doc_provider,
    layout_model,
    ocr_error_model,
    recognition_model,
    detection_model,
):
    """Fully built and structured Document for the current doc_provider."""
    document_builder = DocumentBuilder(config)
    layout = LayoutBuilder(layout_model, config)
    lines = LineBuilder(detection_model, ocr_error_model, config)
    ocr = OcrBuilder(recognition_model, config)
    document = document_builder(doc_provider, layout, lines, ocr)
    StructureBuilder(config)(document)
    yield document
114 |
115 |
@pytest.fixture(scope="function")
def pdf_converter(request, config, model_dict, renderer, llm_service):
    """PdfConverter wired up with the test renderer and optional LLM service."""
    service = classes_to_strings([llm_service])[0] if llm_service else llm_service
    yield PdfConverter(
        artifact_dict=model_dict,
        processor_list=None,
        renderer=classes_to_strings([renderer])[0],
        config=config,
        llm_service=service,
    )
127 |
128 |
@pytest.fixture(scope="function")
def renderer(request, config):
    """Renderer class selected by the @pytest.mark.output_format marker.

    Defaults to MarkdownRenderer when no marker is present; raises ValueError
    for an unrecognized format name.
    """
    marker = request.node.get_closest_marker("output_format")
    if marker is None:
        return MarkdownRenderer

    output_format = marker.args[0]
    renderers = {
        "markdown": MarkdownRenderer,
        "json": JSONRenderer,
        "html": HTMLRenderer,
        "chunks": ChunkRenderer,
    }
    if output_format not in renderers:
        raise ValueError(f"Unknown output format: {output_format}")
    return renderers[output_format]
145 |
146 |
@pytest.fixture(scope="function")
def llm_service(request, config):
    """Resolve the configured LLM service class, or None when unset."""
    service_name = config.get("llm_service")
    if service_name:
        yield strings_to_classes([service_name])[0]
    else:
        yield None
154 |
155 |
@pytest.fixture(scope="function")
def temp_image():
    """A 512x512 white PNG with sample text, written to a temp file."""
    image = Image.new("RGB", (512, 512), color="white")
    ImageDraw.Draw(image).text((200, 200), "Hello, World!", fill="black", font_size=36)
    with tempfile.NamedTemporaryFile(suffix=".png") as f:
        image.save(f.name)
        f.flush()
        yield f
165 |
```
--------------------------------------------------------------------------------
/marker/extractors/document.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 |
3 | from pydantic import BaseModel
4 | from typing import Annotated, Optional, List
5 |
6 | from marker.extractors import BaseExtractor
7 | from marker.extractors.page import PageExtractionSchema
8 | from marker.logger import get_logger
9 |
10 | logger = get_logger()
11 |
12 |
class DocumentExtractionSchema(BaseModel):
    """Structured LLM response: free-text analysis plus the extracted JSON string."""
    analysis: str
    document_json: str
16 |
17 |
class DocumentExtractor(BaseExtractor):
    """
    An extractor that combines data from across all pages.

    Concatenates the per-page notes produced by the page extractor, prompts
    the LLM with those notes plus the target JSON schema, and returns the
    analysis and extracted JSON as a DocumentExtractionSchema.
    """

    page_schema: Annotated[
        str,
        "The JSON schema to be extracted from the page.",
    ] = ""

    page_extraction_prompt = """You are an expert document analyst who reads documents and pulls data out in JSON format. You will receive your detailed notes from all the pages of a document, and a JSON schema that we want to extract from the document. Your task is to extract all the information properly into the JSON schema.

Some notes:
- The schema may contain a single object to extract from the entire document, or an array of objects.
- The schema may contain nested objects, arrays, and other complex structures.

Some guidelines:
- Some entities will span multiple pages, so make sure to consult your notes thoroughly.
- In the case of potential conflicting values, pull out the values you have the most confidence in, from your notes.
- If you cannot find a value for a field, leave it blank in the JSON.

**Instructions:**
1. Analyze your provided notes.
2. Analyze the JSON schema.
3. Write a detailed analysis of the notes, and the associated values in the schema. Make sure to reference which page each piece of information comes from.
4. Write the output in the JSON schema format, ensuring all required fields are filled out. Output only the json data, without any additional text or formatting.

**Example:**
Input:

Detailed Notes
Page 0
On this page, I see a table with car makes and sales. The makes are Honda and Toyota, with sales of 100 and 200 respectively. The color is not present in the table, so I will leave it blank in the JSON. That information may be present on another page. Some JSON snippets I may find useful later are:
```json
{
    "make": "Honda",
    "sales": 100,
}
```
```json
{
    "make": "Toyota",
    "sales": 200,
}
```

Honda is the first row in the table, and Toyota is the second row. Make is the first column, and sales is the second.

Page 1
I see a table that contains 2 rows, and has a color header. The first row has the color red, and the second row has the color blue. Here are some useful snippets:

Schema

```json
{'$defs': {'Cars': {'properties': {'make': {'title': 'Make', 'type': 'string'}, 'sales': {'title': 'Sales', 'type': 'integer'}, 'color': {'title': 'Color', 'type': 'string'}}, 'required': ['make', 'sales', 'color'], 'title': 'Cars', 'type': 'object'}}, 'properties': {'cars': {'items': {'$ref': '#/$defs/Cars'}, 'title': 'Cars', 'type': 'array'}}, 'required': ['cars'], 'title': 'CarsList', 'type': 'object'}
```

Output:

Analysis: From the notes, it looks like the information I need is in a table that spans 2 pages.  The first page has the makes and sales, while the second page has the colors.  I will combine this information into the JSON schema.
JSON

{
    "cars": [
        {
            "make": "Honda",
            "sales": 100,
            "color": "red"
        },
        {
            "make": "Toyota",
            "sales": 200,
            "color": "blue"
        }
    ]
}

**Input:**

Detailed Notes
{{document_notes}}

Schema
```json
{{schema}}
```
"""

    def assemble_document_notes(self, page_notes: List[PageExtractionSchema]) -> str:
        """Concatenate per-page detailed notes into one prompt section.

        Pages with empty notes are skipped. (Bug fix: this previously tested
        the whole ``page_notes`` list — always truthy inside the loop — so
        empty pages were never skipped.)
        """
        notes = ""
        for i, page_schema in enumerate(page_notes):
            if not page_schema.detailed_notes:
                continue
            notes += f"Page {i + 1}\n{page_schema.detailed_notes}\n\n"
        return notes.strip()

    def __call__(
        self,
        page_notes: List[PageExtractionSchema],
        **kwargs,
    ) -> Optional[DocumentExtractionSchema]:
        """Run document-level extraction over the assembled page notes.

        Returns None when the LLM response is missing either required key.
        Raises ValueError when no page schema is configured.
        """
        if not self.page_schema:
            raise ValueError(
                "Page schema must be defined for structured extraction to work."
            )

        prompt = self.page_extraction_prompt.replace(
            "{{document_notes}}", self.assemble_document_notes(page_notes)
        ).replace("{{schema}}", json.dumps(self.page_schema))
        response = self.llm_service(prompt, None, None, DocumentExtractionSchema)

        logger.debug(f"Document extraction response: {response}")

        if not response or any(
            [
                key not in response
                for key in [
                    "analysis",
                    "document_json",
                ]
            ]
        ):
            return None

        # Strip a markdown code fence if present. Bug fix: lstrip/rstrip treat
        # their argument as a *character set*, which could eat leading payload
        # characters (e.g. a JSON document starting with "null").
        raw_json = response["document_json"].strip()
        if raw_json.startswith("```json"):
            raw_json = raw_json[len("```json"):]
        elif raw_json.startswith("```"):
            raw_json = raw_json[3:]
        if raw_json.endswith("```"):
            raw_json = raw_json[:-3]
        json_data = raw_json.strip()

        return DocumentExtractionSchema(
            analysis=response["analysis"], document_json=json_data
        )
147 |
```
--------------------------------------------------------------------------------
/marker/scripts/server.py:
--------------------------------------------------------------------------------
```python
1 | import traceback
2 |
3 | import click
4 | import os
5 |
6 | from pydantic import BaseModel, Field
7 | from starlette.responses import HTMLResponse
8 |
9 | from marker.config.parser import ConfigParser
10 | from marker.output import text_from_rendered
11 |
12 | import base64
13 | from contextlib import asynccontextmanager
14 | from typing import Optional, Annotated
15 | import io
16 |
17 | from fastapi import FastAPI, Form, File, UploadFile
18 | from marker.converters.pdf import PdfConverter
19 | from marker.models import create_model_dict
20 | from marker.settings import settings
21 |
app_data = {}  # populated with loaded models by the lifespan handler


UPLOAD_DIRECTORY = "./uploads"  # scratch directory for /marker/upload bodies
os.makedirs(UPLOAD_DIRECTORY, exist_ok=True)
27 |
28 |
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load marker models at startup; drop the reference at shutdown."""
    app_data["models"] = create_model_dict()

    yield

    app_data.pop("models", None)
37 |
38 |
39 | app = FastAPI(lifespan=lifespan)
40 |
41 |
@app.get("/")
async def root():
    # Plain HTML landing page linking to the docs and the conversion endpoint.
    body = """
<h1>Marker API</h1>
<ul>
    <li><a href="/docs">API Documentation</a></li>
    <li><a href="/marker">Run marker (post request only)</a></li>
</ul>
"""
    return HTMLResponse(body)
53 |
54 |
class CommonParams(BaseModel):
    """Request body shared by the /marker endpoints.

    Mirrors the marker CLI conversion options; see the Field descriptions
    on each attribute for details.
    """

    filepath: Annotated[
        Optional[str], Field(description="The path to the PDF file to convert.")
    ]
    page_range: Annotated[
        Optional[str],
        Field(
            description="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20",
            example=None,
        ),
    ] = None
    force_ocr: Annotated[
        bool,
        Field(
            description="Force OCR on all pages of the PDF. Defaults to False. This can lead to worse results if you have good text in your PDFs (which is true in most cases)."
        ),
    ] = False
    paginate_output: Annotated[
        bool,
        Field(
            description="Whether to paginate the output. Defaults to False. If set to True, each page of the output will be separated by a horizontal rule that contains the page number (2 newlines, {PAGE_NUMBER}, 48 - characters, 2 newlines)."
        ),
    ] = False
    output_format: Annotated[
        str,
        Field(
            description="The format to output the text in. Can be 'markdown', 'json', or 'html'. Defaults to 'markdown'."
        ),
    ] = "markdown"
84 |
85 |
async def _convert_pdf(params: CommonParams):
    """Run the marker pipeline for ``params`` and package a JSON-safe result.

    On any conversion error, returns {"success": False, "error": ...} instead
    of raising. Images are returned base64-encoded.
    """
    # NOTE(review): assert is stripped under `python -O`; an explicit check
    # would be more robust for input validation.
    assert params.output_format in ["markdown", "json", "html", "chunks"], (
        "Invalid output format"
    )
    try:
        config_parser = ConfigParser(params.model_dump())
        config_dict = config_parser.generate_config_dict()
        config_dict["pdftext_workers"] = 1
        converter = PdfConverter(
            config=config_dict,
            artifact_dict=app_data["models"],
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service(),
        )
        rendered = converter(params.filepath)
        text, _, images = text_from_rendered(rendered)
        metadata = rendered.metadata
    except Exception as e:
        traceback.print_exc()
        return {"success": False, "error": str(e)}

    # Base64-encode every extracted image for transport.
    encoded = {}
    for name, image in images.items():
        buffer = io.BytesIO()
        image.save(buffer, format=settings.OUTPUT_IMAGE_FORMAT)
        encoded[name] = base64.b64encode(buffer.getvalue()).decode(
            settings.OUTPUT_ENCODING
        )

    return {
        "format": params.output_format,
        "output": text,
        "images": encoded,
        "metadata": metadata,
        "success": True,
    }
128 |
129 |
@app.post("/marker")
async def convert_pdf(params: CommonParams):
    # Thin endpoint wrapper: all of the work happens in _convert_pdf.
    return await _convert_pdf(params)
133 |
134 |
@app.post("/marker/upload")
async def convert_pdf_upload(
    page_range: Optional[str] = Form(default=None),
    force_ocr: Optional[bool] = Form(default=False),
    paginate_output: Optional[bool] = Form(default=False),
    output_format: Optional[str] = Form(default="markdown"),
    file: UploadFile = File(
        ..., description="The PDF file to convert.", media_type="application/pdf"
    ),
):
    """Accept an uploaded document, convert it, and return the rendered output.

    The upload is written under UPLOAD_DIRECTORY, converted via _convert_pdf,
    and removed afterwards even if conversion raises.
    """
    # Security fix: file.filename is client-controlled; use only its basename
    # so a crafted name like "../../x.pdf" cannot escape the upload directory.
    safe_name = os.path.basename(file.filename or "upload.pdf")
    upload_path = os.path.join(UPLOAD_DIRECTORY, safe_name)
    with open(upload_path, "wb+") as upload_file:
        upload_file.write(await file.read())

    try:
        params = CommonParams(
            filepath=upload_path,
            page_range=page_range,
            force_ocr=force_ocr,
            paginate_output=paginate_output,
            output_format=output_format,
        )
        results = await _convert_pdf(params)
    finally:
        # Always clean up the temporary upload.
        os.remove(upload_path)
    return results
160 |
161 |
@click.command()
@click.option("--port", type=int, default=8000, help="Port to run the server on")
@click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
def server_cli(port: int, host: str):
    # Serve the marker FastAPI app with uvicorn on the requested host/port.
    import uvicorn

    uvicorn.run(app, host=host, port=port)
174 |
```
--------------------------------------------------------------------------------
/marker/processors/equation.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated, List, Tuple
2 | from PIL import Image
3 | import re
4 | from bs4 import BeautifulSoup
5 |
6 | from ftfy import fix_text, TextFixerConfig
7 | from surya.recognition import RecognitionPredictor, OCRResult
8 |
9 | from marker.processors import BaseProcessor
10 | from marker.schema import BlockTypes
11 | from marker.schema.document import Document
12 | from marker.settings import settings
13 |
14 | MATH_TAG_PATTERN = re.compile(r"<math[^>]*>(.*?)</math>")
15 |
16 |
class EquationProcessor(BaseProcessor):
    """
    A processor for recognizing equations in the document.

    Gathers every Equation block across all pages, runs the recognition model
    once over the batched page images/boxes, then normalizes each predicted
    math HTML string via fix_latex and writes it onto the block.
    """

    block_types: Annotated[
        Tuple[BlockTypes],
        "The block types to process.",
    ] = (BlockTypes.Equation,)
    # NOTE(review): model_max_length is not referenced in this file —
    # get_latex_batched hard-codes max_tokens=2048; verify intent.
    model_max_length: Annotated[
        int,
        "The maximum number of tokens to allow for the Recognition model.",
    ] = 1024
    equation_batch_size: Annotated[
        int,
        "The batch size to use for the recognition model while processing equations.",
        "Default is None, which will use the default batch size for the model.",
    ] = None
    disable_tqdm: Annotated[
        bool,
        "Whether to disable the tqdm progress bar.",
    ] = False
    drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False

    def __init__(self, recognition_model: RecognitionPredictor, config=None):
        super().__init__(config)

        self.recognition_model = recognition_model

    def get_batch_size(self) -> int:
        """Return the recognition batch size; explicit config wins over device defaults."""
        # Set to 1/4th of OCR batch size due to sequence length with tiling
        if self.equation_batch_size is not None:
            return self.equation_batch_size
        elif settings.TORCH_DEVICE_MODEL == "cuda":
            return 32
        elif settings.TORCH_DEVICE_MODEL == "mps":
            return 6
        return 6  # CPU fallback (same value as mps)

    def __call__(self, document: Document):
        """Recognize all Equation blocks in ``document`` and set their html."""
        images = []  # one highres image per page
        equation_boxes = []  # per-page lists of bboxes in image coordinates
        equation_block_ids = []  # per-page lists of block ids, parallel to boxes
        total_equation_blocks = 0

        for page in document.pages:
            page_image = page.get_image(highres=True)
            page_size = page.polygon.width, page.polygon.height
            image_size = page_image.size

            page_equation_boxes = []
            page_equation_block_ids = []
            equation_blocks = page.contained_blocks(document, self.block_types)
            for block in equation_blocks:
                # Rescale from page coordinates to highres image coordinates.
                page_equation_boxes.append(
                    block.polygon.rescale(page_size, image_size).bbox
                )
                page_equation_block_ids.append(block.id)
                total_equation_blocks += 1

            images.append(page_image)
            equation_boxes.append(page_equation_boxes)
            equation_block_ids.append(page_equation_block_ids)

        if total_equation_blocks == 0:
            return

        predictions = self.get_latex_batched(images, equation_boxes)
        for page_predictions, page_equation_block_ids in zip(
            predictions, equation_block_ids
        ):
            assert len(page_predictions) == len(page_equation_block_ids), (
                "Every equation block should have a corresponding prediction"
            )
            for block_prediction, block_id in zip(
                page_predictions, page_equation_block_ids
            ):
                block = document.get_block(block_id)
                block.html = self.fix_latex(block_prediction)

    def fix_latex(self, math_html: str) -> str:
        """Normalize predicted math HTML.

        Forces display="block" on the first <math> tag, strips stray literal
        "\\n" sequences and <br> tags, and fixes text encoding. Returns "" when
        no <math> tag is present.
        """
        math_html = math_html.strip()
        soup = BeautifulSoup(math_html, "html.parser")
        opening_math_tag = soup.find("math")

        # No math block found
        if not opening_math_tag:
            return ""

        # Force block format
        opening_math_tag.attrs["display"] = "block"
        fixed_math_html = str(soup)

        # Sometimes model outputs newlines at the beginning/end of tags
        # (raw string: "\\n" matches a literal backslash-n sequence, not a newline).
        fixed_math_html = re.sub(
            r"^<math display=\"block\">\\n(?![a-zA-Z])",
            '<math display="block">',
            fixed_math_html,
        )
        fixed_math_html = re.sub(r"\\n</math>$", "</math>", fixed_math_html)
        fixed_math_html = re.sub(r"<br>", "", fixed_math_html)
        fixed_math_html = fix_text(
            fixed_math_html, config=TextFixerConfig(unescape_html=True)
        )
        return fixed_math_html

    def get_latex_batched(
        self,
        page_images: List[Image.Image],
        bboxes: List[List[List[float]]],
    ):
        """Run recognition over all pages' equation boxes.

        Returns a per-page list of predicted text strings, parallel to bboxes.
        """
        self.recognition_model.disable_tqdm = self.disable_tqdm
        predictions: List[OCRResult] = self.recognition_model(
            images=page_images,
            bboxes=bboxes,
            task_names=["ocr_with_boxes"] * len(page_images),
            recognition_batch_size=self.get_batch_size(),
            sort_lines=False,
            drop_repeated_text=self.drop_repeated_text,
            max_tokens=2048,
            max_sliding_window=2148,
        )

        equation_predictions = [
            [line.text.strip() for line in page_prediction.text_lines]
            for page_prediction in predictions
        ]

        return equation_predictions
146 |
```
--------------------------------------------------------------------------------
/marker/processors/llm/llm_equation.py:
--------------------------------------------------------------------------------
```python
1 | from pydantic import BaseModel
2 |
3 | from marker.processors.llm import BaseLLMSimpleBlockProcessor, PromptData, BlockData
4 | from marker.schema import BlockTypes
5 | from marker.schema.document import Document
6 |
7 | from typing import Annotated, List
8 |
9 |
class LLMEquationProcessor(BaseLLMSimpleBlockProcessor):
    """LLM-backed correction of Equation blocks into KaTeX-compatible math HTML.

    Selects equation blocks that are tall enough relative to the page (or all
    equations when redo_inline_math is set), prompts the LLM with the block
    image plus the current html/text, and replaces block.html with the
    corrected output after sanity checks.
    """

    block_types = (BlockTypes.Equation,)
    min_equation_height: Annotated[
        float,
        "The minimum ratio between equation height and page height to consider for processing.",
    ] = 0.06
    image_expansion_ratio: Annotated[
        float,
        "The ratio to expand the image by when cropping.",
    ] = 0.05 # Equations sometimes get bboxes that are too tight
    redo_inline_math: Annotated[
        bool,
        "Whether to redo inline math blocks.",
    ] = False
    equation_latex_prompt: Annotated[
        str,
        "The prompt to use for generating LaTeX from equations.",
        "Default is a string containing the Gemini prompt."
    ] = r"""You're an expert mathematician who is good at writing LaTeX code and html for equations.
You'll receive an image of a math block, along with the text extracted from the block.  It may contain one or more equations. Your job is to write html that represents the content of the image, with the equations in LaTeX format.

Some guidelines:
- Output valid html, where all the equations can render properly.
- Use <math display="block"> as a block equation delimiter and <math> for inline equations.  Do not use $ or $$ as delimiters.
- Keep the LaTeX code inside the math tags simple, concise, and KaTeX compatible.
- Enclose all equations in the correct math tags. Use multiple math tags inside the html to represent multiple equations.
- Only use the html tags math, i, b, p, and br.
- Make sure to include all the equations in the image in the html output.
- Make sure to include other text in the image in the correct positions along with the equations.

**Instructions:**
1. Carefully examine the provided image.
2. Analyze the existing html, which may include LaTeX code.
3. Write a short analysis of how the html should be corrected to represent the image.
4. If the html and LaTeX are correct, write "No corrections needed."
5. If the html and LaTeX are incorrect, generate the corrected html.
6. Output only the analysis, then the corrected html or "No corrections needed."
**Example:**
Input:
```html
The following equation illustrates the Pythagorean theorem:
x2 + y2 = z2

And this equation is a bit more complex:
(ab * x5 + x2 + 2 * x + 123)/t
```
Output:
analysis: The equations are not formatted as LaTeX, or enclosed in math tags.
```html
<p>The following equation illustrates the Pythagorean theorem:</p>
<math display="block">x^{2} + y^{2} = z^{2}</math>

<p>And this equation is a bit more complex, and contains <math>ab \cdot x^{5}</math>:</p>
<math display="block">\frac{ab \cdot x^{5} + x^{2} + 2 \cdot x + 123}{t}</math>
```
**Input:**
```html
{equation}
```
"""

    def inference_blocks(self, document: Document) -> List[BlockData]:
        """Filter the parent selection down to equations worth reprocessing."""
        blocks = super().inference_blocks(document)
        out_blocks = []
        for block_data in blocks:
            block = block_data["block"]
            page = block_data["page"]

            # If we redo inline math, we redo all equations
            # (skip only when the block is short AND redo_inline_math is off).
            if all([
                block.polygon.height / page.polygon.height < self.min_equation_height,
                not self.redo_inline_math
            ]):
                continue
            out_blocks.append(block_data)
        return out_blocks

    def block_prompts(self, document: Document) -> List[PromptData]:
        """Build one prompt (text + cropped image) per selected equation block."""
        prompt_data = []
        for block_data in self.inference_blocks(document):
            block = block_data["block"]
            text = block.html if block.html else block.raw_text(document)
            prompt = self.equation_latex_prompt.replace("{equation}", text)
            image = self.extract_image(document, block)

            prompt_data.append({
                "prompt": prompt,
                "image": image,
                "block": block,
                "schema": EquationSchema,
                "page": block_data["page"]
            })

        return prompt_data


    def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
        """Apply the LLM's corrected html to the block, with sanity checks."""
        block = prompt_data["block"]
        text = block.html if block.html else block.raw_text(document)

        if not response or "corrected_equation" not in response:
            block.update_metadata(llm_error_count=1)
            return

        html_equation = response["corrected_equation"]

        if "no corrections needed" in html_equation.lower():
            return

        # Reject output with unbalanced math tags or implausibly short content
        # (< 30% of the original text length).
        balanced_tags = html_equation.count("<math") == html_equation.count("</math>")
        if not all([
            html_equation,
            balanced_tags,
            len(html_equation) > len(text) * .3,
        ]):
            block.update_metadata(llm_error_count=1)
            return

        block.html = html_equation
129 |
class EquationSchema(BaseModel):
    """Structured LLM response: free-text analysis plus the corrected equation html."""
    analysis: str
    corrected_equation: str
```
--------------------------------------------------------------------------------
/marker/builders/structure.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated
2 |
3 | from marker.builders import BaseBuilder
4 | from marker.schema import BlockTypes
5 | from marker.schema.blocks import Text
6 | from marker.schema.document import Document
7 | from marker.schema.groups import ListGroup
8 | from marker.schema.groups.page import PageGroup
9 | from marker.schema.registry import get_block_class
10 |
11 |
12 | class StructureBuilder(BaseBuilder):
13 | """
14 | A builder for grouping blocks together based on their structure.
15 | """
16 | gap_threshold: Annotated[
17 | float,
18 | "The minimum gap between blocks to consider them part of the same group.",
19 | ] = 0.05
20 | list_gap_threshold: Annotated[
21 | float,
22 | "The minimum gap between list items to consider them part of the same group.",
23 | ] = 0.1
24 |
25 | def __init__(self, config=None):
26 | super().__init__(config)
27 |
28 | def __call__(self, document: Document):
29 | for page in document.pages:
30 | self.group_caption_blocks(page)
31 | self.group_lists(page)
32 | self.unmark_lists(page)
33 |
34 | def group_caption_blocks(self, page: PageGroup):
35 | gap_threshold_px = self.gap_threshold * page.polygon.height
36 | static_page_structure = page.structure.copy()
37 | remove_ids = list()
38 |
39 | for i, block_id in enumerate(static_page_structure):
40 | block = page.get_block(block_id)
41 | if block.block_type not in [BlockTypes.Table, BlockTypes.Figure, BlockTypes.Picture]:
42 | continue
43 |
44 | if block.id in remove_ids:
45 | continue
46 |
47 | block_structure = [block_id]
48 | selected_polygons = [block.polygon]
49 | caption_types = [BlockTypes.Caption, BlockTypes.Footnote]
50 |
51 | prev_block = page.get_prev_block(block)
52 | next_block = page.get_next_block(block)
53 |
54 | if prev_block and \
55 | prev_block.block_type in caption_types and \
56 | prev_block.polygon.minimum_gap(block.polygon) < gap_threshold_px and \
57 | prev_block.id not in remove_ids:
58 | block_structure.insert(0, prev_block.id)
59 | selected_polygons.append(prev_block.polygon)
60 |
61 | if next_block and \
62 | next_block.block_type in caption_types and \
63 | next_block.polygon.minimum_gap(block.polygon) < gap_threshold_px:
64 | block_structure.append(next_block.id)
65 | selected_polygons.append(next_block.polygon)
66 |
67 | if len(block_structure) > 1:
68 | # Create a merged block
69 | new_block_cls = get_block_class(BlockTypes[block.block_type.name + "Group"])
70 | new_polygon = block.polygon.merge(selected_polygons)
71 | group_block = page.add_block(new_block_cls, new_polygon)
72 | group_block.structure = block_structure
73 |
74 | # Update the structure of the page to reflect the new block
75 | page.update_structure_item(block_id, group_block.id)
76 | remove_ids.extend(block_structure)
77 | page.remove_structure_items(remove_ids)
78 |
    def group_lists(self, page: PageGroup):
        """Collect runs of vertically-adjacent ListItem blocks into a single ListGroup."""
        gap_threshold_px = self.list_gap_threshold * page.polygon.height
        # Snapshot of the structure; the live structure is edited during grouping.
        static_page_structure = page.structure.copy()
        remove_ids = list()
        for i, block_id in enumerate(static_page_structure):
            block = page.get_block(block_id)
            if block.block_type not in [BlockTypes.ListItem]:
                continue

            # Already absorbed into an earlier list group.
            if block.id in remove_ids:
                continue

            block_structure = [block_id]
            selected_polygons = [block.polygon]

            # Greedily extend the run with consecutive, nearby list items; stop at
            # the first non-list-item or too-distant block.
            for j, next_block_id in enumerate(page.structure[i + 1:]):
                next_block = page.get_block(next_block_id)
                if all([
                    next_block.block_type == BlockTypes.ListItem,
                    next_block.polygon.minimum_gap(selected_polygons[-1]) < gap_threshold_px
                ]):
                    block_structure.append(next_block_id)
                    selected_polygons.append(next_block.polygon)
                else:
                    break

            if len(block_structure) > 1:
                new_polygon = block.polygon.merge(selected_polygons)
                group_block = page.add_block(ListGroup, new_polygon)
                group_block.structure = block_structure

                # Update the structure of the page to reflect the new block
                page.update_structure_item(block_id, group_block.id)
                remove_ids.extend(block_structure)

        page.remove_structure_items(remove_ids)
115 |
116 | def unmark_lists(self, page: PageGroup):
117 | # If lists aren't grouped, unmark them as list items
118 | for block_id in page.structure:
119 | block = page.get_block(block_id)
120 | if block.block_type == BlockTypes.ListItem:
121 | generated_block = Text(
122 | polygon=block.polygon,
123 | page_id=block.page_id,
124 | structure=block.structure,
125 | )
126 | page.replace_block(block, generated_block)
127 |
```
--------------------------------------------------------------------------------
/marker/services/gemini.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import time
3 | import traceback
4 | from io import BytesIO
5 | from typing import List, Annotated
6 |
7 | import PIL
8 | from google import genai
9 | from google.genai import types
10 | from google.genai.errors import APIError
11 | from marker.logger import get_logger
12 | from pydantic import BaseModel
13 |
14 | from marker.schema.blocks import Block
15 | from marker.services import BaseService
16 |
17 | logger = get_logger()
18 |
19 |
class BaseGeminiService(BaseService):
    """Shared request logic for Gemini-backed LLM services.

    Subclasses supply a configured client via ``get_google_client``; this base
    class handles image encoding, structured-output configuration, and retries.
    """

    gemini_model_name: Annotated[
        str, "The name of the Google model to use for the service."
    ] = "gemini-2.0-flash"
    thinking_budget: Annotated[
        int, "The thinking token budget to use for the service."
    ] = None

    def img_to_bytes(self, img: PIL.Image.Image):
        # Encode as WEBP to keep the request payload small.
        image_bytes = BytesIO()
        img.save(image_bytes, format="WEBP")
        return image_bytes.getvalue()

    def get_google_client(self, timeout: int):
        # Subclasses must return a genai.Client configured with this timeout.
        raise NotImplementedError

    def process_images(self, images):
        # Wrap each PIL image as an inline WEBP part for the Gemini API.
        image_parts = [
            types.Part.from_bytes(data=self.img_to_bytes(img), mime_type="image/webp")
            for img in images
        ]
        return image_parts

    def __call__(
        self,
        prompt: str,
        image: PIL.Image.Image | List[PIL.Image.Image] | None,
        block: Block | None,
        response_schema: type[BaseModel],
        max_retries: int | None = None,
        timeout: int | None = None,
    ):
        """Send a prompt (plus optional images) to Gemini and return the parsed JSON dict.

        Retries rate-limit APIErrors with linear backoff and invalid-JSON
        responses with a slightly higher temperature; returns ``{}`` when all
        attempts fail.  Token usage is recorded on ``block`` when provided.
        """
        if max_retries is None:
            max_retries = self.max_retries

        if timeout is None:
            timeout = self.timeout

        client = self.get_google_client(timeout=timeout)
        image_parts = self.format_image_for_llm(image)

        total_tries = max_retries + 1
        temperature = 0
        for tries in range(1, total_tries + 1):
            config = {
                "temperature": temperature,
                "response_schema": response_schema,
                "response_mime_type": "application/json",
            }
            if self.max_output_tokens:
                config["max_output_tokens"] = self.max_output_tokens

            if self.thinking_budget is not None:
                # For gemini models, we can optionally set a thinking budget in the config
                config["thinking_config"] = types.ThinkingConfig(
                    thinking_budget=self.thinking_budget
                )

            try:
                responses = client.models.generate_content(
                    model=self.gemini_model_name,
                    contents=image_parts
                    + [
                        prompt
                    ],  # According to gemini docs, it performs better if the image is the first element
                    config=config,
                )
                output = responses.candidates[0].content.parts[0].text
                total_tokens = responses.usage_metadata.total_token_count
                if block:
                    block.update_metadata(
                        llm_tokens_used=total_tokens, llm_request_count=1
                    )
                return json.loads(output)
            except APIError as e:
                # NOTE(review): 443 is not a standard HTTP status code (it is the
                # HTTPS port) — confirm this retryable-code list is intended.
                if e.code in [429, 443, 503]:
                    # Rate limit exceeded
                    if tries == total_tries:
                        # Last attempt failed. Give up
                        logger.error(
                            f"APIError: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
                        )
                        break
                    else:
                        # Linear backoff: wait longer on each successive attempt.
                        wait_time = tries * self.retry_wait_time
                        logger.warning(
                            f"APIError: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})",
                        )
                        time.sleep(wait_time)
                else:
                    # Non-retryable API error.
                    logger.error(f"APIError: {e}")
                    break
            except json.JSONDecodeError as e:
                temperature = 0.2  # Increase temperature slightly to try and get a different response

                # The response was not valid JSON
                if tries == total_tries:
                    # Last attempt failed. Give up
                    logger.error(
                        f"JSONDecodeError: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
                    )
                    break
                else:
                    logger.warning(
                        f"JSONDecodeError: {e}. Retrying... (Attempt {tries}/{total_tries})",
                    )
            except Exception as e:
                # Unexpected failure: log with traceback and stop retrying.
                logger.error(f"Exception: {e}")
                traceback.print_exc()
                break

        return {}
132 |
133 |
class GoogleGeminiService(BaseGeminiService):
    """Gemini service that talks directly to the Google Generative AI API."""

    gemini_api_key: Annotated[str, "The Google API key to use for the service."] = None

    def get_google_client(self, timeout: int):
        return genai.Client(
            api_key=self.gemini_api_key,
            http_options={"timeout": timeout * 1000},  # Convert to milliseconds
        )
142 |
```
--------------------------------------------------------------------------------
/marker/processors/line_numbers.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated
2 |
3 | from marker.processors import BaseProcessor
4 | from marker.schema import BlockTypes
5 | from marker.schema.document import Document
6 |
7 |
class LineNumbersProcessor(BaseProcessor):
    """
    A processor for ignoring line numbers.

    Detects line-number artifacts — left-margin numeric spans, numeric line
    prefixes/suffixes, and whole numeric blocks — and marks them with
    ``ignore_for_output`` so they are dropped from rendered output.
    """
    block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
    strip_numbers_threshold: Annotated[
        float,
        "The fraction of lines or tokens in a block that must be numeric to consider them as line numbers.",
    ] = 0.6
    min_lines_in_block: Annotated[
        int,
        "The minimum number of lines required in a block for it to be considered during processing.",
        "Ensures that small blocks are ignored as they are unlikely to contain meaningful line numbers.",
    ] = 4
    min_line_length: Annotated[
        int,
        "The minimum length of a line (in characters) to consider it significant when checking for",
        "numeric prefixes or suffixes. Prevents false positives for short lines.",
    ] = 10
    min_line_number_span_ratio: Annotated[
        float,
        "The minimum ratio of detected line number spans to total lines required to treat them as line numbers.",
    ] = .6

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        # Order matters only loosely: spans first, then prefixes/suffixes, then whole blocks.
        self.ignore_line_number_spans(document)
        self.ignore_line_starts_ends(document)
        self.ignore_line_number_blocks(document)

    def ignore_line_number_spans(self, document: Document):
        """Hide the leftmost span of each line when enough lines on a page start with a bare number."""
        for page in document.pages:
            line_count = 0
            line_number_spans = []
            for block in page.contained_blocks(document, (BlockTypes.Line,)):
                if block.structure is None:
                    continue

                line_count += 1
                # Find the span furthest to the left on this line.
                leftmost_span = None
                for span in block.contained_blocks(document, (BlockTypes.Span,)):
                    if leftmost_span is None or span.polygon.x_start < leftmost_span.polygon.x_start:
                        leftmost_span = span

                if leftmost_span is not None and leftmost_span.text.strip().isnumeric():
                    line_number_spans.append(leftmost_span)

            # Only treat them as line numbers when most lines on the page have one.
            if line_count > 0 and len(line_number_spans) / line_count > self.min_line_number_span_ratio:
                for span in line_number_spans:
                    span.ignore_for_output = True

    def ignore_line_number_blocks(self, document: Document):
        """Hide whole blocks that are mostly digits and taller than wide (e.g. a column of line numbers)."""
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                raw_text = block.raw_text(document)
                tokens = raw_text.strip().split()
                # Too few tokens to judge reliably.
                if len(tokens) < 4:
                    continue

                tokens_are_numbers = [token.isdigit() for token in tokens]
                if all([
                    sum(tokens_are_numbers) / len(tokens) > self.strip_numbers_threshold,
                    block.polygon.height > block.polygon.width  # Ensure block is taller than it is wide, like vertical page numbers
                ]):
                    block.ignore_for_output = True

    def ignore_line_starts_ends(self, document: Document):
        """Hide the numeric first/last span of lines when most lines in a block start/end with a number."""
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                if block.structure is None:
                    continue

                all_lines = block.structure_blocks(document)
                if len(all_lines) < self.min_lines_in_block:
                    continue

                starts_with_number = []
                ends_with_number = []
                for line in all_lines:
                    spans = line.structure_blocks(document)
                    # A one-span line cannot have a separate line-number span.
                    if len(spans) < 2:
                        starts_with_number.append(False)
                        ends_with_number.append(False)
                        continue

                    raw_text = line.raw_text(document)
                    # A numeric prefix/suffix only counts when the rest of the line is long enough.
                    starts = all([
                        spans[0].text.strip().isdigit(),
                        len(raw_text) - len(spans[0].text.strip()) > self.min_line_length
                    ])

                    ends = all([
                        spans[-1].text.strip().isdigit(),
                        len(raw_text) - len(spans[-1].text.strip()) > self.min_line_length
                    ])

                    starts_with_number.append(starts)
                    ends_with_number.append(ends)

                if sum(starts_with_number) / len(starts_with_number) > self.strip_numbers_threshold:
                    for starts, line in zip(starts_with_number, all_lines):
                        if starts:
                            span = page.get_block(line.structure[0])
                            span.ignore_for_output = True

                if sum(ends_with_number) / len(ends_with_number) > self.strip_numbers_threshold:
                    for ends, line in zip(ends_with_number, all_lines):
                        if ends:
                            span = page.get_block(line.structure[-1])
                            span.ignore_for_output = True
120 |
```
--------------------------------------------------------------------------------
/marker/scripts/streamlit_app.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 |
3 | from marker.scripts.common import (
4 | load_models,
5 | parse_args,
6 | img_to_html,
7 | get_page_image,
8 | page_count,
9 | )
10 |
11 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
12 | os.environ["IN_STREAMLIT"] = "true"
13 |
14 | from marker.settings import settings
15 | from streamlit.runtime.uploaded_file_manager import UploadedFile
16 |
17 | import re
18 | import tempfile
19 | from typing import Any, Dict
20 |
21 | import streamlit as st
22 | from PIL import Image
23 |
24 | from marker.converters.pdf import PdfConverter
25 | from marker.config.parser import ConfigParser
26 | from marker.output import text_from_rendered
27 |
28 |
def convert_pdf(fname: str, config_parser: ConfigParser) -> (str, Dict[str, Any], dict):
    """Convert the file at ``fname`` with a PdfConverter built from ``config_parser``.

    NOTE(review): the annotation suggests a (text, metadata, images) tuple, but
    this returns ``converter(fname)`` directly — the rendered output object
    (``text_from_rendered`` is applied later by the caller).  Confirm and fix
    the annotation.
    """
    config_dict = config_parser.generate_config_dict()
    # The streamlit app runs conversion in-process; keep pdftext single-threaded.
    config_dict["pdftext_workers"] = 1
    converter_cls = PdfConverter
    converter = converter_cls(
        config=config_dict,
        artifact_dict=model_dict,  # module-level models loaded at app startup
        processor_list=config_parser.get_processors(),
        renderer=config_parser.get_renderer(),
        llm_service=config_parser.get_llm_service(),
    )
    return converter(fname)
41 |
42 |
def markdown_insert_images(markdown, images):
    """Replace markdown image tags whose path is in ``images`` with inline HTML <img> tags."""
    tag_pattern = re.compile(
        r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))'
    )

    # finditer runs over the original string (snapshot), so replacements below
    # do not affect which tags are found.
    for match in tag_pattern.finditer(markdown):
        full_tag = match.group(1)
        alt_text = match.group("image_title")
        path = match.group("image_path")
        if path in images:
            markdown = markdown.replace(full_tag, img_to_html(images[path], alt_text))
    return markdown
58 |
59 |
st.set_page_config(layout="wide")
col1, col2 = st.columns([0.5, 0.5])

# Load models and CLI defaults once at app startup.
model_dict = load_models()
cli_options = parse_args()

st.markdown("""
# Marker Demo

This app will let you try marker, a PDF or image -> Markdown, HTML, JSON converter. It works with any language, and extracts images, tables, equations, etc.

Find the project [here](https://github.com/VikParuchuri/marker).
""")

in_file: UploadedFile = st.sidebar.file_uploader(
    "PDF, document, or image file:",
    type=["pdf", "png", "jpg", "jpeg", "gif", "pptx", "docx", "xlsx", "html", "epub"],
)

if in_file is None:
    st.stop()

filetype = in_file.type

with col1:
    # Fix: use a distinct name so the imported page_count() helper is not shadowed.
    total_pages = page_count(in_file)
    page_number = st.number_input(
        f"Page number out of {total_pages}:", min_value=0, value=0, max_value=total_pages
    )
    pil_image = get_page_image(in_file, page_number)
    st.image(pil_image, use_container_width=True)

page_range = st.sidebar.text_input(
    "Page range to parse, comma separated like 0,5-10,20",
    value=f"{page_number}-{page_number}",
)
output_format = st.sidebar.selectbox(
    "Output format", ["markdown", "json", "html", "chunks"], index=0
)
run_marker = st.sidebar.button("Run Marker")

use_llm = st.sidebar.checkbox(
    "Use LLM", help="Use LLM for higher quality processing", value=False
)
force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
strip_existing_ocr = st.sidebar.checkbox(
    "Strip existing OCR",
    help="Strip existing OCR text from the PDF and re-OCR.",
    value=False,
)
debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
disable_ocr_math = st.sidebar.checkbox(
    "Disable math",
    help="Disable math in OCR output - no inline math",
    value=False,
)

if not run_marker:
    st.stop()

# Run Marker
with tempfile.TemporaryDirectory() as tmp_dir:
    # Persist the upload to disk; the converter expects a file path.
    temp_pdf = os.path.join(tmp_dir, "temp.pdf")
    with open(temp_pdf, "wb") as f:
        f.write(in_file.getvalue())

    cli_options.update(
        {
            "output_format": output_format,
            "page_range": page_range,
            "force_ocr": force_ocr,
            "debug": debug,
            "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
            "use_llm": use_llm,
            "strip_existing_ocr": strip_existing_ocr,
            "disable_ocr_math": disable_ocr_math,
        }
    )
    config_parser = ConfigParser(cli_options)
    rendered = convert_pdf(temp_pdf, config_parser)
    # Resolve the parsed page range to locate debug images for the first page.
    page_range = config_parser.generate_config_dict()["page_range"]
    first_page = page_range[0] if page_range else 0

text, ext, images = text_from_rendered(rendered)
with col2:
    if output_format == "markdown":
        text = markdown_insert_images(text, images)
        st.markdown(text, unsafe_allow_html=True)
    elif output_format == "json":
        st.json(text)
    elif output_format == "html":
        st.html(text)
    elif output_format == "chunks":
        st.json(text)

if debug:
    with col1:
        debug_data_path = rendered.metadata.get("debug_data_path")
        if debug_data_path:
            pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png")
            img = Image.open(pdf_image_path)
            st.image(img, caption="PDF debug image", use_container_width=True)
            layout_image_path = os.path.join(
                debug_data_path, f"layout_page_{first_page}.png"
            )
            img = Image.open(layout_image_path)
            st.image(img, caption="Layout debug image", use_container_width=True)
            st.write("Raw output:")
            st.code(text, language=output_format)
169 |
```
--------------------------------------------------------------------------------
/benchmarks/throughput/main.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 | import tempfile
3 | import time
4 | from multiprocessing import get_context
5 | from concurrent.futures import ProcessPoolExecutor
6 | import torch
7 |
8 | import click
9 | import pypdfium2 as pdfium
10 | from tqdm import tqdm
11 |
12 | import datasets
13 |
14 |
def get_next_pdf(ds, i: int):
    """Return the next usable PDF from ``ds`` starting at index ``i``, wrapping around.

    ``ds`` is any sequence of dicts with "pdf" (bytes) and "filename" keys
    (e.g. a datasets.Dataset).  An entry is usable when its pdf payload is
    truthy and its filename ends with ".pdf".

    Returns:
        (pdf_bytes, filename, next_index) where next_index is the position
        to resume scanning from on the next call.

    Raises:
        ValueError: if a full pass over the dataset finds no usable PDF
        (the original implementation looped forever in this case).
    """
    total = len(ds)
    checked = 0
    while checked < total:
        pdf = ds[i]["pdf"]
        filename = ds[i]["filename"]
        if pdf and filename.endswith(".pdf"):
            return pdf, filename, i + 1
        i += 1
        checked += 1
        # Wrap around to the start of the dataset.
        if i >= total:
            i = 0
    raise ValueError("Dataset contains no usable PDF entries")
24 |
25 |
def single_batch(
    batch_size: int,
    num_threads: int,
    force_ocr: bool,
    quantize: bool,
    compile: bool,
    worker_id: int,
    chunksize: int = 100,
):
    """Convert ``batch_size`` PDFs in this worker process and return throughput stats.

    Returns a tuple of (total_conversion_seconds, start_timestamp,
    end_timestamp, peak_gpu_vram_gb, total_pages, total_markdown_chars).
    NOTE(review): assumes a CUDA device is available (torch.cuda calls).
    """
    # Quantization/compilation are toggled via env vars read by the model code.
    if quantize:
        os.environ["RECOGNITION_MODEL_QUANTIZE"] = "true"
    if compile:
        os.environ["COMPILE_ALL"] = "true"

    # Cap CPU thread pools so multiple workers don't oversubscribe the machine.
    for item in [
        "DETECTOR_POSTPROCESSING_CPU_WORKERS",
        "OPENBLAS_NUM_THREADS",
        "PDFTEXT_CPU_WORKERS",
        "OMP_NUM_THREADS",
    ]:
        os.environ[item] = f"{num_threads}"

    torch.set_num_threads(num_threads)

    # Import after the env vars are set so the libraries pick them up.
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict
    from marker.output import text_from_rendered

    ds = datasets.load_dataset("datalab-to/pdfs", split="train")
    model_dict = create_model_dict()
    torch.cuda.reset_peak_memory_stats()

    times = []  # per-chunk conversion durations (model loading excluded)
    i = 0  # dataset cursor for get_next_pdf
    pages = 0
    chars = 0

    min_time = time.time()  # wall-clock start of the conversion phase
    for _ in range(batch_size):
        pdf, fname, i = get_next_pdf(ds, i)
        print(f"Inferencing {fname} on worker {worker_id}...")

        pdf_doc = pdfium.PdfDocument(pdf)
        page_count = len(pdf_doc)
        pdf_doc.close()
        pages += page_count

        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            f.write(pdf)
            f.flush()
            # Convert in chunks of pages to bound per-call memory usage.
            page_range_chunks = list(range(0, page_count, chunksize))
            for chunk_start in page_range_chunks:
                chunk_end = min(chunk_start + chunksize, page_count)
                page_range = list(range(chunk_start, chunk_end))

                block_converter = PdfConverter(
                    artifact_dict=model_dict,
                    config={
                        "disable_tqdm": worker_id > 0,  # only worker 0 shows progress bars
                        "page_range": page_range,
                        "force_ocr": force_ocr,
                    },
                )
                start = time.time()
                rendered = block_converter(f.name)
                markdown, _, _ = text_from_rendered(rendered)
                chars += len(markdown)

                total = time.time() - start
                times.append(total)

    max_gpu_vram = torch.cuda.max_memory_reserved() / 1024**3
    max_time = time.time()  # wall-clock end of the conversion phase
    return sum(times), min_time, max_time, max_gpu_vram, pages, chars
100 |
101 |
@click.command(help="Benchmark PDF to MD conversion throughput.")
@click.option("--workers", default=1, help="Number of workers to use.")
@click.option("--batch_size", default=1, help="Batch size for inference.")
@click.option("--force_ocr", is_flag=True, help="Force OCR on all pages.")
@click.option("--quantize", is_flag=True, help="Use quantized model.")
@click.option("--compile", is_flag=True, help="Use compiled model.")
def main(
    workers: int,
    batch_size: int,
    force_ocr: bool,
    quantize: bool,
    compile: bool,
):
    """Spawn worker processes, aggregate their per-worker stats, and print a throughput report."""
    total_cpus = os.cpu_count()
    start = time.time()
    # Baseline VRAM before workers start; subtracted from each worker's peak.
    current_gpu_vram = torch.cuda.memory_reserved() / 1024**3
    # Spawn (not fork) so each worker initializes CUDA cleanly.
    with ProcessPoolExecutor(
        max_workers=workers, mp_context=get_context("spawn")
    ) as executor:
        # Split CPUs across workers, clamped to the 2-8 range.
        cpus_per_worker = min(8, max(2, total_cpus // workers))
        futures = [
            executor.submit(
                single_batch,
                batch_size,
                cpus_per_worker,
                force_ocr,
                quantize,
                compile,
                i,
            )
            for i in range(workers)
        ]
        all_times = []
        min_time = None  # earliest conversion-phase start across workers
        max_time = time.time()  # latest conversion-phase end across workers
        vrams = []
        page_count = 0
        char_count = 0
        for future in tqdm(futures, desc="Running marker workers..."):
            times, min_time_worker, max_time_worker, max_vram, pages, chars = (
                future.result()
            )
            vrams.append(max_vram - current_gpu_vram)
            all_times.append(times)
            page_count += pages
            char_count += chars
            min_time = (
                min(min_time_worker, min_time)
                if min_time is not None
                else min_time_worker
            )
            max_time = max(max_time, max_time_worker)

    end = time.time() - start
    # Conversion-only wall time: span between first worker start and last worker end.
    all_worker_time = max_time - min_time

    print(f"Average time per worker: {sum(all_times) / len(all_times)}")
    print(f"Max time per worker: {max(all_times)}")
    print(f"End to end time (counting model loading), all processes: {end}")
    print(f"End to end time (no model loading), all processes: {all_worker_time}")
    print(f"Total pages: {page_count}")
    print(f"Total characters: {char_count}")
    print(f"Time per page: {all_worker_time / page_count:.2f}")
    print(f"Characters per second: {char_count / all_worker_time:.2f}")
    print(f"Max GPU VRAM: {max(vrams):.2f} GB")
    print(f"Average GPU VRAM: {sum(vrams) / len(vrams):.2f} GB")


if __name__ == "__main__":
    main()
172 |
```
--------------------------------------------------------------------------------
/marker/renderers/__init__.py:
--------------------------------------------------------------------------------
```python
1 | import base64
2 | import io
3 | import re
4 | from collections import Counter
5 | from typing import Annotated, Optional, Tuple, Literal
6 |
7 | from bs4 import BeautifulSoup
8 | from pydantic import BaseModel
9 |
10 | from marker.schema import BlockTypes
11 | from marker.schema.blocks.base import BlockId, BlockOutput
12 | from marker.schema.document import Document
13 | from marker.settings import settings
14 | from marker.util import assign_config
15 |
16 |
class BaseRenderer:
    """Base class for document renderers.

    Subclasses implement ``__call__`` to turn a Document into an output format.
    This base provides image extraction, HTML post-processing helpers, and
    metadata generation shared by all renderers.
    """

    image_blocks: Annotated[
        Tuple[BlockTypes, ...], "The block types to consider as images."
    ] = (BlockTypes.Picture, BlockTypes.Figure)
    extract_images: Annotated[bool, "Extract images from the document."] = True
    image_extraction_mode: Annotated[
        Literal["lowres", "highres"],
        "The mode to use for extracting images.",
    ] = "highres"
    keep_pageheader_in_output: Annotated[
        bool, "Keep the page header in the output HTML."
    ] = False
    keep_pagefooter_in_output: Annotated[
        bool, "Keep the page footer in the output HTML."
    ] = False
    add_block_ids: Annotated[bool, "Whether to add block IDs to the output HTML."] = (
        False
    )

    def __init__(self, config: Optional[BaseModel | dict] = None):
        assign_config(self, config)

        # Options forwarded to block-level rendering.
        self.block_config = {
            "keep_pageheader_in_output": self.keep_pageheader_in_output,
            "keep_pagefooter_in_output": self.keep_pagefooter_in_output,
            "add_block_ids": self.add_block_ids,
        }

    def __call__(self, document):
        # Children are in reading order
        raise NotImplementedError

    def extract_image(self, document: Document, image_id, to_base64=False):
        """Crop the image for ``image_id`` from its page; optionally return it base64-encoded."""
        image_block = document.get_block(image_id)
        cropped = image_block.get_image(
            document, highres=self.image_extraction_mode == "highres"
        )

        if to_base64:
            image_buffer = io.BytesIO()
            # RGBA to RGB
            if not cropped.mode == "RGB":
                cropped = cropped.convert("RGB")

            cropped.save(image_buffer, format=settings.OUTPUT_IMAGE_FORMAT)
            cropped = base64.b64encode(image_buffer.getvalue()).decode(
                settings.OUTPUT_ENCODING
            )
        return cropped

    @staticmethod
    def merge_consecutive_math(html, tag="math"):
        """Join hyphen-split math: replaces "-</tag><tag>" boundaries with a space, merging the spans."""
        if not html:
            return html
        pattern = rf"-</{tag}>(\s*)<{tag}>"
        html = re.sub(pattern, " ", html)

        # Same merge for inline-display math tags.
        pattern = rf'-</{tag}>(\s*)<{tag} display="inline">'
        html = re.sub(pattern, " ", html)
        return html

    @staticmethod
    def merge_consecutive_tags(html, tag):
        """Collapse back-to-back identical tags, keeping one space only if whitespace separated them."""
        if not html:
            return html

        def replace_whitespace(match):
            whitespace = match.group(1)
            if len(whitespace) == 0:
                return ""
            else:
                return " "

        pattern = rf"</{tag}>(\s*)<{tag}>"

        # Repeat until stable: each pass can create new adjacent pairs.
        while True:
            new_merged = re.sub(pattern, replace_whitespace, html)
            if new_merged == html:
                break
            html = new_merged

        return html

    def generate_page_stats(self, document: Document, document_output):
        """Per-page stats: block-type counts, extraction method, aggregated block metadata."""
        page_stats = []
        for page in document.pages:
            block_counts = Counter(
                [str(block.block_type) for block in page.children]
            ).most_common()
            block_metadata = page.aggregate_block_metadata()
            page_stats.append(
                {
                    "page_id": page.page_id,
                    "text_extraction_method": page.text_extraction_method,
                    "block_counts": block_counts,
                    "block_metadata": block_metadata.model_dump(),
                }
            )
        return page_stats

    def generate_document_metadata(self, document: Document, document_output):
        """Build the metadata dict attached to rendered output (TOC, page stats, debug path)."""
        metadata = {
            "table_of_contents": document.table_of_contents,
            "page_stats": self.generate_page_stats(document, document_output),
        }
        if document.debug_data_path is not None:
            metadata["debug_data_path"] = document.debug_data_path

        return metadata

    def extract_block_html(self, document: Document, block_output: BlockOutput):
        """Resolve <content-ref> placeholders in a block's HTML, collecting any images.

        Returns (html, images) where images maps a BlockId to a base64 string.
        """
        soup = BeautifulSoup(block_output.html, "html.parser")

        content_refs = soup.find_all("content-ref")
        ref_block_id = None
        images = {}
        for ref in content_refs:
            src = ref.get("src")
            sub_images = {}
            # NOTE(review): if no child matches ``src``, ref_block_id keeps its
            # previous value (or None) and ``content`` is unbound below — this
            # assumes every content-ref always has a matching child; confirm.
            for item in block_output.children:
                if item.id == src:
                    content, sub_images_ = self.extract_block_html(document, item)
                    sub_images.update(sub_images_)
                    ref_block_id: BlockId = item.id
                    break

            if ref_block_id.block_type in self.image_blocks and self.extract_images:
                images[ref_block_id] = self.extract_image(
                    document, ref_block_id, to_base64=True
                )
            else:
                images.update(sub_images)
                ref.replace_with(BeautifulSoup(content, "html.parser"))

        if block_output.id.block_type in self.image_blocks and self.extract_images:
            images[block_output.id] = self.extract_image(
                document, block_output.id, to_base64=True
            )

        return str(soup), images
157 |
```
--------------------------------------------------------------------------------
/marker/processors/llm/llm_sectionheader.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | from typing import List, Tuple
3 |
4 | from tqdm import tqdm
5 |
6 | from marker.logger import get_logger
7 | from marker.processors.llm import BaseLLMComplexBlockProcessor
8 | from marker.schema import BlockTypes
9 | from marker.schema.blocks import Block
10 | from marker.schema.document import Document
11 | from marker.schema.groups import PageGroup
12 | from pydantic import BaseModel
13 |
14 | logger = get_logger()
15 |
16 |
class LLMSectionHeaderProcessor(BaseLLMComplexBlockProcessor):
    """LLM pass that corrects section-header levels (h1-h6) across the whole document.

    All section headers are serialized to JSON (with page and geometry info),
    sent to the LLM in a single request, and any returned corrections are
    applied back onto the blocks.
    """

    page_prompt = """You're a text correction expert specializing in accurately analyzing complex PDF documents. You will be given a list of all of the section headers from a document, along with their page number and approximate dimensions. The headers will be formatted like below, and will be presented in order.

```json
[
    {
        "bbox": [x1, y1, x2, y2],
        "width": x2 - x1,
        "height": y2 - y1,
        "page": 0,
        "id": "/page/0/SectionHeader/1",
        "html": "<h1>Introduction</h1>",
    }, ...
]
```

Bboxes have been normalized to 0-1000.

Your goal is to make sure that the section headers have the correct levels (h1, h2, h3, h4, h5, or h6). If a section header does not have the right level, edit the html to fix it.

Guidelines:
- Edit the blocks to ensure that the section headers have the correct levels.
- Only edit the h1, h2, h3, h4, h5, and h6 tags. Do not change any other tags or content in the headers.
- Only output the headers that changed (if nothing changed, output nothing).
- Every header you output needs to have one and only one level tag (h1, h2, h3, h4, h5, or h6).

**Instructions:**
1. Carefully examine the provided section headers and JSON.
2. Identify any changes you'll need to make, and write a short analysis.
3. Output "no_corrections", or "corrections_needed", depending on whether you need to make changes.
4. If corrections are needed, output any blocks that need updates. Only output the block ids and html, like this:
```json
[
    {
        "id": "/page/0/SectionHeader/1",
        "html": "<h2>Introduction</h2>"
    },
    ...
]
```

**Example:**
Input:
Section Headers
```json
[
    {
        "bbox": [x1, y1, x2, y2],
        "id": "/page/0/SectionHeader/1",
        "page": 0,
        "html": "1 Vector Operations",
    },
    {
        "bbox": [x1, y1, x2, y2],
        "id": "/page/0/SectionHeader/2",
        "page": 0,
        "html": "1.1 Vector Addition",
    },
]
```
Output:
Analysis: The first section header is missing the h1 tag, and the second section header is missing the h2 tag.
```json
[
    {
        "id": "/page/0/SectionHeader/1",
        "html": "<h1>1 Vector Operations</h1>"
    },
    {
        "id": "/page/0/SectionHeader/2",
        "html": "<h2>1.1 Vector Addition</h2>"
    }
]
```

**Input:**
Section Headers
```json
{{section_header_json}}
```
"""

    def get_selected_blocks(
        self,
        document: Document,
        page: PageGroup,
    ) -> List[dict]:
        """Serialize every block on ``page`` into the normalized JSON form."""
        selected_blocks = page.structure_blocks(document)
        json_blocks = [
            self.normalize_block_json(block, document, page, i)
            for i, block in enumerate(selected_blocks)
        ]
        return json_blocks

    def process_rewriting(
        self, document: Document, section_headers: List[Tuple[Block, dict]]
    ):
        """Send all section headers to the LLM and apply any returned level corrections."""
        section_header_json = [sh[1] for sh in section_headers]
        for item in section_header_json:
            # Ids look like "/page/<page>/SectionHeader/<idx>".
            _, _, page_id, block_type, block_id = item["id"].split("/")
            item["page"] = page_id
            item["width"] = item["bbox"][2] - item["bbox"][0]
            item["height"] = item["bbox"][3] - item["bbox"][1]
            del item["block_type"]  # Not needed, since they're all section headers

        prompt = self.page_prompt.replace(
            "{{section_header_json}}", json.dumps(section_header_json)
        )
        # No image needed; the first page is passed only as the metadata-bearing block.
        response = self.llm_service(
            prompt, None, document.pages[0], SectionHeaderSchema
        )
        # Fix: log message previously misspelled "reponse".
        logger.debug(f"Got section header response from LLM: {response}")

        if not response or "correction_type" not in response:
            logger.warning("LLM did not return a valid response")
            return

        correction_type = response["correction_type"]
        if correction_type == "no_corrections":
            return

        self.load_blocks(response)
        self.handle_rewrites(response["blocks"], document)

    def load_blocks(self, response):
        # Some models return the block list as a JSON string; decode it in place.
        if isinstance(response["blocks"], str):
            response["blocks"] = json.loads(response["blocks"])

    def rewrite_blocks(self, document: Document):
        """Collect all section headers in the document and run one correction pass."""
        # Don't show progress if there are no blocks to process
        section_headers = [
            (block, self.normalize_block_json(block, document, page))
            for page in document.pages
            for block in page.structure_blocks(document)
            if block.block_type == BlockTypes.SectionHeader
        ]
        if len(section_headers) == 0:
            return

        pbar = tqdm(
            total=1,
            desc=f"Running {self.__class__.__name__}",
            disable=self.disable_tqdm,
        )

        self.process_rewriting(document, section_headers)
        pbar.update(1)
        pbar.close()
165 |
166 |
class BlockSchema(BaseModel):
    """A single corrected block returned by the LLM."""

    id: str  # Block id, e.g. "/page/0/SectionHeader/1"
    html: str  # Corrected HTML for the block
170 |
171 |
class SectionHeaderSchema(BaseModel):
    """Response schema for the section-header correction LLM call."""

    analysis: str  # Free-form reasoning from the model
    correction_type: str  # "no_corrections" means nothing should change
    blocks: List[BlockSchema]  # Rewritten section header blocks
176 |
```
--------------------------------------------------------------------------------
/marker/renderers/html.py:
--------------------------------------------------------------------------------
```python
1 | import textwrap
2 |
3 | from PIL import Image
4 | from typing import Annotated, Tuple
5 |
6 | from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
7 | from pydantic import BaseModel
8 |
9 | from marker.renderers import BaseRenderer
10 | from marker.schema import BlockTypes
11 | from marker.schema.blocks import BlockId
12 | from marker.settings import settings
13 |
14 | # Ignore beautifulsoup warnings
15 | import warnings
16 |
17 | warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
18 |
19 | # Suppress DecompressionBombError
20 | Image.MAX_IMAGE_PIXELS = None
21 |
22 |
class HTMLOutput(BaseModel):
    """Rendered HTML plus extracted images and document metadata."""

    html: str  # Full HTML document string
    images: dict  # Maps image filename -> extracted image object
    metadata: dict  # Document-level metadata from generate_document_metadata
27 |
28 |
class HTMLRenderer(BaseRenderer):
    """
    A renderer for HTML output.

    Walks the rendered document tree, replacing <content-ref> placeholder
    tags with the HTML of the referenced child blocks, extracting images
    along the way.
    """

    page_blocks: Annotated[
        Tuple[BlockTypes],
        "The block types to consider as pages.",
    ] = (BlockTypes.Page,)
    paginate_output: Annotated[
        bool,
        "Whether to paginate the output.",
    ] = False

    def extract_image(self, document, image_id):
        """Return the cropped image for the block with id *image_id*."""
        image_block = document.get_block(image_id)
        cropped = image_block.get_image(
            document, highres=self.image_extraction_mode == "highres"
        )
        return cropped

    def insert_block_id(self, soup, block_id: BlockId):
        """
        Insert a block ID into the soup as a data attribute.
        """
        # Lines and spans are too granular to tag individually.
        if block_id.block_type in [BlockTypes.Line, BlockTypes.Span]:
            return soup

        if self.add_block_ids:
            # Find the outermost tag (first tag that isn't a NavigableString)
            outermost_tag = None
            for element in soup.contents:
                if hasattr(element, "name") and element.name:
                    outermost_tag = element
                    break

            # If we found an outermost tag, add the data-block-id attribute
            if outermost_tag:
                outermost_tag["data-block-id"] = str(block_id)

            # If soup only contains text or no tags, wrap in a span
            elif soup.contents:
                wrapper = soup.new_tag("span")
                wrapper["data-block-id"] = str(block_id)

                contents = list(soup.contents)
                for content in contents:
                    content.extract()
                    wrapper.append(content)
                soup.append(wrapper)
        return soup

    def extract_html(self, document, document_output, level=0):
        """
        Recursively resolve <content-ref> tags into HTML.

        Returns a (html, images) tuple; images maps filenames to extracted
        image objects. At level 0 the output is wrapped in a full HTML
        document skeleton and consecutive b/i/math tags are merged.
        """
        soup = BeautifulSoup(document_output.html, "html.parser")

        content_refs = soup.find_all("content-ref")
        ref_block_id = None
        images = {}
        for ref in content_refs:
            src = ref.get("src")
            sub_images = {}
            content = ""
            # NOTE(review): if no child matches src, ref_block_id keeps its
            # value from the previous iteration (None on the first), which
            # would misattribute or crash below — presumably every
            # content-ref always has a matching child; confirm upstream.
            for item in document_output.children:
                if item.id == src:
                    content, sub_images_ = self.extract_html(document, item, level + 1)
                    sub_images.update(sub_images_)
                    ref_block_id: BlockId = item.id
                    break

            if ref_block_id.block_type in self.image_blocks:
                if self.extract_images:
                    image = self.extract_image(document, ref_block_id)
                    image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
                    images[image_name] = image
                    element = BeautifulSoup(
                        f"<p>{content}<img src='{image_name}'></p>", "html.parser"
                    )
                    ref.replace_with(self.insert_block_id(element, ref_block_id))
                else:
                    # This will be the image description if using llm mode, or empty if not
                    element = BeautifulSoup(f"{content}", "html.parser")
                    ref.replace_with(self.insert_block_id(element, ref_block_id))
            elif ref_block_id.block_type in self.page_blocks:
                images.update(sub_images)
                if self.paginate_output:
                    content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
                element = BeautifulSoup(f"{content}", "html.parser")
                ref.replace_with(self.insert_block_id(element, ref_block_id))
            else:
                images.update(sub_images)
                element = BeautifulSoup(f"{content}", "html.parser")
                ref.replace_with(self.insert_block_id(element, ref_block_id))

        output = str(soup)
        if level == 0:
            output = self.merge_consecutive_tags(output, "b")
            output = self.merge_consecutive_tags(output, "i")
            output = self.merge_consecutive_math(
                output
            )  # Merge consecutive inline math tags
            output = textwrap.dedent(f"""
            <!DOCTYPE html>
            <html>
                <head>
                    <meta charset="utf-8" />
                </head>
                <body>
                    {output}
                </body>
            </html>
""")

        return output, images

    def __call__(self, document) -> HTMLOutput:
        """Render *document* to a complete HTML document with images."""
        document_output = document.render(self.block_config)
        full_html, images = self.extract_html(document, document_output)
        soup = BeautifulSoup(full_html, "html.parser")
        full_html = soup.prettify()  # Add indentation to the HTML
        return HTMLOutput(
            html=full_html,
            images=images,
            metadata=self.generate_document_metadata(document, document_output),
        )
153 |
```
--------------------------------------------------------------------------------
/marker/extractors/page.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | from concurrent.futures import ThreadPoolExecutor
3 |
4 | from pydantic import BaseModel
5 | from typing import Annotated, Optional, List
6 |
7 | from tqdm import tqdm
8 |
9 | from marker.extractors import BaseExtractor
10 | from marker.logger import get_logger
11 |
12 | logger = get_logger()
13 |
14 |
class PageExtractionSchema(BaseModel):
    """LLM response for one chunk of pages during structured extraction."""

    description: str  # Short summary of schema fields vs. page content
    detailed_notes: str  # Per-page notes with candidate JSON snippets
18 |
19 |
class PageExtractor(BaseExtractor):
    """
    An extractor that pulls data from a single page.

    Pages are grouped into chunks of ``extraction_page_chunk_size`` and each
    chunk is sent to the LLM service concurrently; results come back as
    ``PageExtractionSchema`` notes to be merged later.
    """

    extraction_page_chunk_size: Annotated[
        int, "The number of pages to chunk together for extraction."
    ] = 3

    page_schema: Annotated[
        str,
        "The JSON schema to be extracted from the page.",
    ] = ""

    page_extraction_prompt = """You are an expert document analyst who reads documents and pulls data out in JSON format. You will receive the markdown representation of a document page, and a JSON schema that we want to extract from the document. Your task is to write detailed notes on this page, so that when you look at all your notes from across the document, you can fill in the schema.

Some notes:
- The schema may contain a single object to extract from the entire document, or an array of objects.
- The schema may contain nested objects, arrays, and other complex structures.

Some guidelines:
- Write very thorough notes, and include specific JSON snippets that can be extracted from the page.
- You may need information from prior or subsequent pages to fully fill in the schema, so make sure to write detailed notes that will let you join entities across pages later on.
- Estimate your confidence in the values you extract, so you can reconstruct the JSON later when you only have your notes.
- Some tables and other data structures may continue on a subsequent page, so make sure to store the positions that data comes from where appropriate.

**Instructions:**
1. Analyze the provided markdown representation of the page.
2. Analyze the JSON schema.
3. Write a short description of the fields in the schema, and the associated values in the markdown.
4. Write detailed notes on the page, including any values that can be extracted from the markdown. Include snippets of JSON that can be extracted from the page where possible.

**Example:**
Input:

Markdown
```markdown
| Make   | Sales |
|--------|-------|
| Honda  | 100   |
| Toyota | 200   |
```

Schema

```json
{'$defs': {'Cars': {'properties': {'make': {'title': 'Make', 'type': 'string'}, 'sales': {'title': 'Sales', 'type': 'integer'}, 'color': {'title': 'Color', 'type': 'string'}}, 'required': ['make', 'sales', 'color'], 'title': 'Cars', 'type': 'object'}}, 'properties': {'cars': {'items': {'$ref': '#/$defs/Cars'}, 'title': 'Cars', 'type': 'array'}}, 'required': ['cars'], 'title': 'CarsList', 'type': 'object'}
```

Output:

Description: The schema has a list of cars, each with a make, sales, and color. The image and markdown contain a table with 2 cars: Honda with 100 sales and Toyota with 200 sales. The color is not present in the table.
Detailed Notes: On this page, I see a table with car makes and sales. The makes are Honda and Toyota, with sales of 100 and 200 respectively. The color is not present in the table, so I will leave it blank in the JSON. That information may be present on another page. Some JSON snippets I may find useful later are:
```json
{
    "make": "Honda",
    "sales": 100,
}
```
```json
{
    "make": "Toyota",
    "sales": 200,
}
```

Honda is the first row in the table, and Toyota is the second row. Make is the first column, and sales is the second.

**Input:**

Markdown
```markdown
{{page_md}}
```

Schema
```json
{{schema}}
```
"""

    def chunk_page_markdown(self, page_markdown: List[str]) -> List[str]:
        """
        Chunk the page markdown into smaller pieces for processing.
        """

        chunks = []
        for i in range(0, len(page_markdown), self.extraction_page_chunk_size):
            chunk = page_markdown[i : i + self.extraction_page_chunk_size]
            chunks.append("\n\n".join(chunk))

        return chunks

    def inference_single_chunk(
        self, page_markdown: str
    ) -> Optional[PageExtractionSchema]:
        """Run extraction on one markdown chunk; None on invalid LLM output."""
        prompt = self.page_extraction_prompt.replace(
            "{{page_md}}", page_markdown
        ).replace("{{schema}}", json.dumps(self.page_schema))
        response = self.llm_service(prompt, None, None, PageExtractionSchema)
        logger.debug(f"Page extraction response: {response}")

        # Both keys must be present for the response to be usable.
        if not response or any(
            [
                key not in response
                for key in [
                    "description",
                    "detailed_notes",
                ]
            ]
        ):
            return None

        return PageExtractionSchema(
            description=response["description"],
            detailed_notes=response["detailed_notes"],
        )

    def __call__(
        self,
        page_markdown: List[str],
        **kwargs,
    ) -> List[PageExtractionSchema]:
        """Extract notes for every chunk of pages, in order.

        Raises:
            ValueError: If no page schema was configured.
        """
        if not self.page_schema:
            raise ValueError(
                "Page schema must be defined for structured extraction to work."
            )

        chunks = self.chunk_page_markdown(page_markdown)
        results = []
        pbar = tqdm(
            desc="Running page extraction",
            disable=self.disable_tqdm,
            total=len(chunks),
        )

        # Futures are consumed in submission order, so results stay aligned
        # with the input chunk order.
        with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
            for future in [
                executor.submit(self.inference_single_chunk, chunk) for chunk in chunks
            ]:
                results.append(future.result())  # Raise exceptions if any occurred
                pbar.update(1)

        pbar.close()
        return results
165 |
```
--------------------------------------------------------------------------------
/marker/scripts/common.py:
--------------------------------------------------------------------------------
```python
1 | import ast
2 | import base64
3 | import io
4 | import re
5 | import sys
6 | from typing import Optional
7 |
8 | from PIL import Image
9 | import click
10 | import pypdfium2
11 | import streamlit as st
12 | from pydantic import BaseModel
13 | from streamlit.runtime.uploaded_file_manager import UploadedFile
14 |
15 | from marker.config.parser import ConfigParser
16 | from marker.config.printer import CustomClickPrinter
17 | from marker.models import create_model_dict
18 | from marker.settings import settings
19 |
20 |
@st.cache_data()
def parse_args():
    """Parse marker's common CLI options from sys.argv for the streamlit app.

    Returns the parsed params dict, or {"error": ...} if parsing failed.
    Cached by streamlit, so argv is only parsed once per session.
    """

    # Use to grab common cli options
    @ConfigParser.common_options
    def options_func():
        pass

    def extract_click_params(decorated_function):
        # Click stores options attached by decorators on __click_params__.
        if hasattr(decorated_function, "__click_params__"):
            return decorated_function.__click_params__
        return []

    cmd = CustomClickPrinter("Marker app.")
    extracted_params = extract_click_params(options_func)
    cmd.params.extend(extracted_params)
    ctx = click.Context(cmd)
    try:
        cmd_args = sys.argv[1:]
        cmd.parse_args(ctx, cmd_args)
        return ctx.params
    except click.exceptions.ClickException as e:
        return {"error": str(e)}
43 |
44 |
@st.cache_resource()
def load_models():
    """Create and cache the model dict once per streamlit server process."""
    return create_model_dict()
48 |
49 |
def open_pdf(pdf_file):
    """Open an uploaded file's bytes as a pypdfium2 document."""
    return pypdfium2.PdfDocument(io.BytesIO(pdf_file.getvalue()))
53 |
54 |
def img_to_html(img, img_alt):
    """Encode a PIL image as an inline base64 <img> tag."""
    buffer = io.BytesIO()
    img.save(buffer, format=settings.OUTPUT_IMAGE_FORMAT)
    encoded = base64.b64encode(buffer.getvalue()).decode()
    fmt = settings.OUTPUT_IMAGE_FORMAT.lower()
    return (
        f'<img src="data:image/{fmt};base64,{encoded}"'
        f' alt="{img_alt}" style="max-width: 100%;">'
    )
62 |
63 |
@st.cache_data()
def get_page_image(pdf_file, page_num, dpi=96):
    """Render one page of an uploaded PDF (or open an image file) as RGB.

    Args:
        pdf_file: Streamlit UploadedFile; PDFs are detected via MIME type.
        page_num: Zero-based page index (ignored for image files).
        dpi: Render resolution; scale is dpi/72 since PDF units are points.
    """
    if "pdf" in pdf_file.type:
        doc = open_pdf(pdf_file)
        page = doc[page_num]
        png_image = (
            page.render(
                scale=dpi / 72,
            )
            .to_pil()
            .convert("RGB")
        )
    else:
        # Non-PDF uploads are treated as a single image.
        png_image = Image.open(pdf_file).convert("RGB")
    return png_image
79 |
80 |
@st.cache_data()
def page_count(pdf_file: UploadedFile):
    """Return the highest selectable page index for the uploaded file.

    NOTE(review): for PDFs this returns len(doc) - 1 (last zero-based page
    index, presumably for a page-picker widget) but returns 1 for images —
    confirm the caller's expectation; 0 would be the consistent value.
    """
    if "pdf" in pdf_file.type:
        doc = open_pdf(pdf_file)
        return len(doc) - 1
    else:
        return 1
88 |
89 |
def pillow_image_to_base64_string(img: Image) -> str:
    """Serialize *img* as JPEG and return the bytes base64-encoded."""
    with io.BytesIO() as buf:
        img.save(buf, format="JPEG")
        payload = buf.getvalue()
    return base64.b64encode(payload).decode("utf-8")
94 |
95 |
def extract_root_pydantic_class(schema_code: str) -> Optional[str]:
    """Find the name of the "root" pydantic model in a schema code snippet.

    The root is the BaseModel subclass that references other models in its
    field annotations but is not referenced by any of them. Returns None
    when no unambiguous root exists or the code cannot be parsed.

    Fix: class names are now collected in a first pass before analyzing
    field annotations. The previous single-pass walk missed references to
    classes defined *after* the referencing class (forward references), so
    a root declared first in the source was never detected.
    """
    try:
        tree = ast.parse(schema_code)

        # Pass 1: collect every class definition that inherits from BaseModel,
        # so references to later-defined classes are also detected.
        pydantic_nodes = []
        class_names = set()
        for node in ast.walk(tree):
            if isinstance(node, ast.ClassDef) and any(
                isinstance(base, ast.Name) and base.id == "BaseModel"
                for base in node.bases
            ):
                pydantic_nodes.append(node)
                class_names.add(node.name)

        if len(class_names) == 1:
            return list(class_names)[0]

        # Pass 2: record, per class, which other pydantic classes appear in
        # its annotated field types.
        class_info = {
            name: {"references": set(), "fields": []} for name in class_names
        }
        for node in pydantic_nodes:
            for item in node.body:
                if isinstance(item, ast.AnnAssign) and isinstance(
                    item.target, ast.Name
                ):
                    class_info[node.name]["fields"].append(item.target.id)
                    annotation_str = ast.unparse(item.annotation)

                    # Look for List[ClassName], Optional[ClassName], Dict[Any, ClassName], etc.
                    for other_class in class_names:
                        pattern = rf"(?:List|Dict|Set|Tuple|Optional|Union)?\[.*{other_class}.*\]|{other_class}"
                        if re.search(pattern, annotation_str):
                            class_info[node.name]["references"].add(other_class)

        referenced_classes = set()
        for info in class_info.values():
            referenced_classes.update(info["references"])

        # Root candidates reference others but aren't referenced themselves.
        root_candidates = {
            name
            for name, info in class_info.items()
            if info["references"] and name not in referenced_classes
        }

        # If we found exactly one root candidate, return it
        if len(root_candidates) == 1:
            return list(root_candidates)[0]

        return None
    except Exception as e:
        # Best-effort helper for user-supplied schema code; report and bail.
        print(f"Error parsing schema: {e}")
        return None
159 |
160 |
def get_root_class(schema_code: str) -> Optional[BaseModel]:
    """Execute user-supplied schema code and return its root pydantic class.

    Returns None if no unambiguous root class name can be determined.

    SECURITY: this exec()s arbitrary code from the user — acceptable only
    because the streamlit app runs locally on the user's own machine; do not
    expose this path to untrusted input.
    """
    root_class_name = extract_root_pydantic_class(schema_code)

    if not root_class_name:
        return None

    # Prepend the imports the snippet almost certainly needs but may omit.
    if "from pydantic" not in schema_code:
        schema_code = "from pydantic import BaseModel\n" + schema_code
    if "from typing" not in schema_code:
        schema_code = (
            "from typing import List, Dict, Optional, Set, Tuple, Union, Any\n\n"
            + schema_code
        )

    # Execute the code in a new namespace
    namespace = {}
    exec(schema_code, namespace)

    # Return the root class object
    return namespace.get(root_class_name)
181 |
```
--------------------------------------------------------------------------------
/marker/config/parser.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import os
3 | from typing import Dict
4 |
5 | import click
6 |
7 | from marker.converters.pdf import PdfConverter
8 | from marker.logger import get_logger
9 | from marker.renderers.chunk import ChunkRenderer
10 | from marker.renderers.html import HTMLRenderer
11 | from marker.renderers.json import JSONRenderer
12 | from marker.renderers.markdown import MarkdownRenderer
13 | from marker.settings import settings
14 | from marker.util import classes_to_strings, parse_range_str, strings_to_classes
15 |
16 | logger = get_logger()
17 |
18 |
class ConfigParser:
    """Translates parsed CLI options into marker's runtime configuration:
    a config dict plus the converter, renderer, processor, and LLM-service
    classes to instantiate."""

    def __init__(self, cli_options: dict):
        self.cli_options = cli_options

    @staticmethod
    def common_options(fn):
        """Click decorator that attaches marker's shared CLI options to *fn*."""
        fn = click.option(
            "--output_dir",
            type=click.Path(exists=False),
            required=False,
            default=settings.OUTPUT_DIR,
            help="Directory to save output.",
        )(fn)
        fn = click.option("--debug", "-d", is_flag=True, help="Enable debug mode.")(fn)
        fn = click.option(
            "--output_format",
            type=click.Choice(["markdown", "json", "html", "chunks"]),
            default="markdown",
            help="Format to output results in.",
        )(fn)
        fn = click.option(
            "--processors",
            type=str,
            default=None,
            help="Comma separated list of processors to use. Must use full module path.",
        )(fn)
        fn = click.option(
            "--config_json",
            type=str,
            default=None,
            help="Path to JSON file with additional configuration.",
        )(fn)
        fn = click.option(
            "--disable_multiprocessing",
            is_flag=True,
            default=False,
            help="Disable multiprocessing.",
        )(fn)
        fn = click.option(
            "--disable_image_extraction",
            is_flag=True,
            default=False,
            help="Disable image extraction.",
        )(fn)
        # these are options that need a list transformation, i.e splitting/parsing a string
        fn = click.option(
            "--page_range",
            type=str,
            default=None,
            help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20",
        )(fn)

        # we put common options here
        fn = click.option(
            "--converter_cls",
            type=str,
            default=None,
            help="Converter class to use. Defaults to PDF converter.",
        )(fn)
        fn = click.option(
            "--llm_service",
            type=str,
            default=None,
            help="LLM service to use - should be full import path, like marker.services.gemini.GoogleGeminiService",
        )(fn)
        return fn

    # NOTE(review): `any` below is the builtin function used as an annotation,
    # not typing.Any — harmless at runtime, but typing.Any was likely intended.
    def generate_config_dict(self) -> Dict[str, any]:
        """Map CLI options to config keys; falsy option values are skipped."""
        config = {}
        output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR)
        for k, v in self.cli_options.items():
            if not v:
                continue

            match k:
                case "debug":
                    # --debug fans out into several debug-output settings.
                    config["debug_pdf_images"] = True
                    config["debug_layout_images"] = True
                    config["debug_json"] = True
                    config["debug_data_folder"] = output_dir
                case "page_range":
                    config["page_range"] = parse_range_str(v)
                case "config_json":
                    # JSON file entries override/extend the CLI-derived config.
                    with open(v, "r", encoding="utf-8") as f:
                        config.update(json.load(f))
                case "disable_multiprocessing":
                    config["pdftext_workers"] = 1
                case "disable_image_extraction":
                    config["extract_images"] = False
                case _:
                    config[k] = v

        # Backward compatibility for google_api_key
        if settings.GOOGLE_API_KEY:
            config["gemini_api_key"] = settings.GOOGLE_API_KEY

        return config

    def get_llm_service(self):
        """Return the LLM service import path, or None when use_llm is off."""
        # Only return an LLM service when use_llm is enabled
        if not self.cli_options.get("use_llm", False):
            return None

        service_cls = self.cli_options.get("llm_service", None)
        if service_cls is None:
            service_cls = "marker.services.gemini.GoogleGeminiService"
        return service_cls

    def get_renderer(self):
        """Return the renderer class (as an import-path string) for the
        requested output format."""
        match self.cli_options["output_format"]:
            case "json":
                r = JSONRenderer
            case "markdown":
                r = MarkdownRenderer
            case "html":
                r = HTMLRenderer
            case "chunks":
                r = ChunkRenderer
            case _:
                raise ValueError("Invalid output format")
        return classes_to_strings([r])[0]

    def get_processors(self):
        """Split and validate --processors; returns the list of import paths
        (not classes), or None when the option wasn't given."""
        processors = self.cli_options.get("processors", None)
        if processors is not None:
            processors = processors.split(",")
            for p in processors:
                try:
                    # Validate that the import path resolves; re-raise if not.
                    strings_to_classes([p])
                except Exception as e:
                    logger.error(f"Error loading processor: {p} with error: {e}")
                    raise

        return processors

    def get_converter_cls(self):
        """Resolve --converter_cls to a class; defaults to PdfConverter."""
        converter_cls = self.cli_options.get("converter_cls", None)
        if converter_cls is not None:
            try:
                return strings_to_classes([converter_cls])[0]
            except Exception as e:
                logger.error(
                    f"Error loading converter: {converter_cls} with error: {e}"
                )
                raise

        return PdfConverter

    def get_output_folder(self, filepath: str):
        """Create (if needed) and return <output_dir>/<input basename>."""
        output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR)
        fname_base = os.path.splitext(os.path.basename(filepath))[0]
        output_dir = os.path.join(output_dir, fname_base)
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def get_base_filename(self, filepath: str):
        """Return the input filename without directory or extension."""
        basename = os.path.basename(filepath)
        return os.path.splitext(basename)[0]
177 |
```
--------------------------------------------------------------------------------
/marker/builders/layout.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated, List
2 |
3 | from surya.layout import LayoutPredictor
4 | from surya.layout.schema import LayoutResult, LayoutBox
5 |
6 | from marker.builders import BaseBuilder
7 | from marker.providers.pdf import PdfProvider
8 | from marker.schema import BlockTypes
9 | from marker.schema.document import Document
10 | from marker.schema.groups.page import PageGroup
11 | from marker.schema.polygon import PolygonBox
12 | from marker.schema.registry import get_block_class
13 | from marker.settings import settings
14 |
15 |
class LayoutBuilder(BaseBuilder):
    """
    A builder for performing layout detection on PDF pages and merging the results into the document.
    """

    layout_batch_size: Annotated[
        int,
        "The batch size to use for the layout model.",
        "Default is None, which will use the default batch size for the model.",
    ] = None
    force_layout_block: Annotated[
        str,
        "Skip layout and force every page to be treated as a specific block type.",
    ] = None
    disable_tqdm: Annotated[
        bool,
        "Disable tqdm progress bars.",
    ] = False
    expand_block_types: Annotated[
        List[BlockTypes],
        "Block types whose bounds should be expanded to accomodate missing regions",
    ] = [
        BlockTypes.Picture,
        BlockTypes.Figure,
        BlockTypes.ComplexRegion,
    ]  # Does not include groups since they are only injected later
    max_expand_frac: Annotated[
        float, "The maximum fraction to expand the layout box bounds by"
    ] = 0.05

    def __init__(self, layout_model: LayoutPredictor, config=None):
        self.layout_model = layout_model

        super().__init__(config)

    def __call__(self, document: Document, provider: PdfProvider):
        """Run layout detection (or forced layout) and attach blocks to pages."""
        if self.force_layout_block is not None:
            # Assign the full content of every page to a single layout type
            layout_results = self.forced_layout(document.pages)
        else:
            layout_results = self.surya_layout(document.pages)
        self.add_blocks_to_pages(document.pages, layout_results)
        self.expand_layout_blocks(document)

    def get_batch_size(self):
        """Return the configured batch size, or a device-dependent default."""
        if self.layout_batch_size is not None:
            return self.layout_batch_size
        elif settings.TORCH_DEVICE_MODEL == "cuda":
            return 12
        return 6

    def forced_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
        """Build synthetic layout results: one full-page box of
        ``force_layout_block`` per page."""
        layout_results = []
        for page in pages:
            layout_results.append(
                LayoutResult(
                    image_bbox=page.polygon.bbox,
                    bboxes=[
                        LayoutBox(
                            label=self.force_layout_block,
                            position=0,
                            top_k={self.force_layout_block: 1},
                            polygon=page.polygon.polygon,
                        ),
                    ],
                    sliced=False,
                )
            )
        return layout_results

    def surya_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
        """Run the surya layout model on lowres page images."""
        self.layout_model.disable_tqdm = self.disable_tqdm
        layout_results = self.layout_model(
            [p.get_image(highres=False) for p in pages],
            batch_size=int(self.get_batch_size()),
        )
        return layout_results

    def expand_layout_blocks(self, document: Document):
        """Grow picture/figure/complex-region boxes toward their neighbors.

        Each eligible block is expanded by at most ``max_expand_frac`` of its
        own size, and never more than the minimum gap to any other block on
        the page, so expansion cannot create overlaps.
        """
        for page in document.pages:
            # Collect all blocks on this page as PolygonBox for easy access
            page_blocks = [document.get_block(bid) for bid in page.structure]
            page_size = page.polygon.size

            for block_id in page.structure:
                block = document.get_block(block_id)
                if block.block_type in self.expand_block_types:
                    other_blocks = [b for b in page_blocks if b != block]
                    if not other_blocks:
                        # Lone block on the page: expand by the max fraction.
                        block.polygon = block.polygon.expand(
                            self.max_expand_frac, self.max_expand_frac
                        ).fit_to_bounds((0, 0, *page_size))
                        continue

                    min_gap = min(
                        block.polygon.minimum_gap(other.polygon)
                        for other in other_blocks
                    )
                    # Touching/overlapping a neighbor already; don't expand.
                    if min_gap <= 0:
                        continue

                    # Convert the absolute gap into per-axis fractions of the
                    # block's own dimensions (guarding zero-sized blocks).
                    x_expand_frac = (
                        min_gap / block.polygon.width if block.polygon.width > 0 else 0
                    )
                    y_expand_frac = (
                        min_gap / block.polygon.height
                        if block.polygon.height > 0
                        else 0
                    )

                    block.polygon = block.polygon.expand(
                        min(self.max_expand_frac, x_expand_frac),
                        min(self.max_expand_frac, y_expand_frac),
                    ).fit_to_bounds((0, 0, *page_size))

    def add_blocks_to_pages(
        self, pages: List[PageGroup], layout_results: List[LayoutResult]
    ):
        """Convert layout boxes to document blocks, rescaled from layout-image
        coordinates to provider page coordinates."""
        for page, layout_result in zip(pages, layout_results):
            layout_page_size = PolygonBox.from_bbox(layout_result.image_bbox).size
            provider_page_size = page.polygon.size
            page.layout_sliced = (
                layout_result.sliced
            )  # This indicates if the page was sliced by the layout model
            for bbox in sorted(layout_result.bboxes, key=lambda x: x.position):
                block_cls = get_block_class(BlockTypes[bbox.label])
                layout_block = page.add_block(
                    block_cls, PolygonBox(polygon=bbox.polygon)
                )
                layout_block.polygon = layout_block.polygon.rescale(
                    layout_page_size, provider_page_size
                ).fit_to_bounds((0, 0, *provider_page_size))
                # Keep only alternative labels that marker knows about.
                layout_block.top_k = {
                    BlockTypes[label]: prob
                    for (label, prob) in bbox.top_k.items()
                    if label in BlockTypes.__members__
                }
                page.add_structure(layout_block)

            # Ensure page has non-empty structure
            if page.structure is None:
                page.structure = []

            # Ensure page has non-empty children
            if page.children is None:
                page.children = []
162 |
```
--------------------------------------------------------------------------------
/marker/scripts/convert.py:
--------------------------------------------------------------------------------
```python
1 | import atexit
2 | import os
3 | import time
4 |
5 | import psutil
6 | import torch
7 |
8 | from marker.utils.batch import get_batch_sizes_worker_counts
9 |
10 | # Ensure threads don't contend
11 | os.environ["MKL_DYNAMIC"] = "FALSE"
12 | os.environ["OMP_DYNAMIC"] = "FALSE"
13 | os.environ["OMP_NUM_THREADS"] = "2" # Avoid OpenMP issues with multiprocessing
14 | os.environ["OPENBLAS_NUM_THREADS"] = "2"
15 | os.environ["MKL_NUM_THREADS"] = "2"
16 | os.environ["GRPC_VERBOSITY"] = "ERROR"
17 | os.environ["GLOG_minloglevel"] = "2"
18 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = (
19 | "1" # Transformers uses .isin for a simple op, which is not supported on MPS
20 | )
21 | os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
22 |
23 | import math
24 | import traceback
25 |
26 | import click
27 | import torch.multiprocessing as mp
28 | from tqdm import tqdm
29 | import gc
30 |
31 | from marker.config.parser import ConfigParser
32 | from marker.config.printer import CustomClickPrinter
33 | from marker.logger import configure_logging, get_logger
34 | from marker.models import create_model_dict
35 | from marker.output import output_exists, save_output
36 | from marker.utils.gpu import GPUManager
37 |
38 | configure_logging()
39 | logger = get_logger()
40 |
41 |
def worker_init():
    """Per-worker-process initializer: load models once and keep a global
    reference for process_single_pdf to use."""
    model_dict = create_model_dict()

    global model_refs
    model_refs = model_dict

    # Ensure we clean up the model references on exit
    atexit.register(worker_exit)
50 |
51 |
def worker_exit():
    """Release the worker's global model references (best-effort)."""
    global model_refs
    if "model_refs" in globals():
        del model_refs
58 |
59 |
def process_single_pdf(args):
    """Convert one PDF in a worker process; returns the number of pages done.

    Args:
        args: (filepath, cli_options) tuple — packed this way because the
            function is dispatched through a multiprocessing pool.

    Errors are logged and swallowed so one bad file doesn't kill the batch;
    in that case 0 is returned.
    """
    page_count = 0
    fpath, cli_options = args
    # Thread budget is computed by the parent and must not leak into the
    # ConfigParser options below.
    torch.set_num_threads(cli_options["total_torch_threads"])
    del cli_options["total_torch_threads"]

    config_parser = ConfigParser(cli_options)

    out_folder = config_parser.get_output_folder(fpath)
    base_name = config_parser.get_base_filename(fpath)
    if cli_options.get("skip_existing") and output_exists(out_folder, base_name):
        return page_count

    converter_cls = config_parser.get_converter_cls()
    config_dict = config_parser.generate_config_dict()
    config_dict["disable_tqdm"] = True

    try:
        if cli_options.get("debug_print"):
            logger.debug(f"Converting {fpath}")
        # model_refs is the per-worker global set up by worker_init().
        converter = converter_cls(
            config=config_dict,
            artifact_dict=model_refs,
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service(),
        )
        rendered = converter(fpath)
        out_folder = config_parser.get_output_folder(fpath)
        save_output(rendered, out_folder, base_name)
        page_count = converter.page_count

        if cli_options.get("debug_print"):
            logger.debug(f"Converted {fpath}")
        # Drop large objects promptly to keep worker memory bounded.
        del rendered
        del converter
    except Exception as e:
        logger.error(f"Error converting {fpath}: {e}")
        traceback.print_exc()
    finally:
        gc.collect()

    return page_count
103 |
104 |
@click.command(cls=CustomClickPrinter)
@click.argument("in_folder", type=str)
@click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
@click.option(
    "--num_chunks",
    type=int,
    default=1,
    help="Number of chunks being processed in parallel",
)
@click.option(
    "--max_files", type=int, default=None, help="Maximum number of pdfs to convert"
)
@click.option(
    "--skip_existing",
    is_flag=True,
    default=False,
    help="Skip existing converted files.",
)
@click.option(
    "--debug_print", is_flag=True, default=False, help="Print debug information."
)
@click.option(
    "--max_tasks_per_worker",
    type=int,
    default=10,
    help="Maximum number of tasks per worker process before recycling.",
)
@click.option(
    "--workers",
    type=int,
    default=None,
    help="Number of worker processes to use. Set automatically by default, but can be overridden.",
)
@ConfigParser.common_options
def convert_cli(in_folder: str, **kwargs):
    """Convert every PDF in ``in_folder`` using a pool of worker processes.

    Splits the folder's files into ``num_chunks`` contiguous chunks and
    processes only chunk ``chunk_idx``, so several invocations can run in
    parallel (e.g. one per GPU). Prints a throughput summary at the end.
    """
    total_pages = 0
    in_folder = os.path.abspath(in_folder)
    files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
    files = [f for f in files if os.path.isfile(f)]

    # Handle chunks if we're processing in parallel
    # Ensure we get all files into a chunk
    chunk_size = math.ceil(len(files) / kwargs["num_chunks"])
    start_idx = kwargs["chunk_idx"] * chunk_size
    end_idx = start_idx + chunk_size
    files_to_convert = files[start_idx:end_idx]

    # Limit files converted if needed
    if kwargs["max_files"]:
        files_to_convert = files_to_convert[: kwargs["max_files"]]

    # Disable nested multiprocessing
    kwargs["disable_multiprocessing"] = True

    try:
        mp.set_start_method("spawn")  # Required for CUDA, forkserver doesn't work
    except RuntimeError:
        raise RuntimeError(
            "Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again."
        )

    chunk_idx = kwargs["chunk_idx"]

    # Use GPU context manager for automatic setup/cleanup
    with GPUManager(chunk_idx) as gpu_manager:
        # NOTE(review): get_batch_sizes_worker_counts is not in the visible
        # imports -- presumably imported earlier in this file; confirm.
        batch_sizes, workers = get_batch_sizes_worker_counts(gpu_manager, 7)

        # Override workers if specified
        if kwargs["workers"] is not None:
            workers = kwargs["workers"]

        # Set proper batch sizes and thread counts
        # Each worker gets an even share of the physical cores (minimum of 2).
        total_processes = max(1, min(len(files_to_convert), workers))
        kwargs["total_torch_threads"] = max(
            2, psutil.cpu_count(logical=False) // total_processes
        )
        kwargs.update(batch_sizes)

        logger.info(
            f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}"
        )
        task_args = [(f, kwargs) for f in files_to_convert]

        start_time = time.time()
        with mp.Pool(
            processes=total_processes,
            initializer=worker_init,
            maxtasksperchild=kwargs["max_tasks_per_worker"],
        ) as pool:
            pbar = tqdm(total=len(task_args), desc="Processing PDFs", unit="pdf")
            for page_count in pool.imap_unordered(process_single_pdf, task_args):
                pbar.update(1)
                total_pages += page_count
            pbar.close()

        total_time = time.time() - start_time
        # NOTE(review): divides by total_time; a zero-duration run would raise
        # ZeroDivisionError -- likely acceptable for a CLI summary, but confirm.
        print(
            f"Inferenced {total_pages} pages in {total_time:.2f} seconds, for a throughput of {total_pages / total_time:.2f} pages/sec for chunk {chunk_idx + 1}/{kwargs['num_chunks']}"
        )
204 |
```
--------------------------------------------------------------------------------
/marker/processors/llm/__init__.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import traceback
3 | from concurrent.futures import ThreadPoolExecutor, as_completed
4 | from typing import Annotated, TypedDict, List, Sequence
5 |
6 | from pydantic import BaseModel
7 | from tqdm import tqdm
8 | from PIL import Image
9 |
10 | from marker.output import json_to_html
11 | from marker.processors import BaseProcessor
12 | from marker.schema import BlockTypes
13 | from marker.schema.blocks import Block, BlockId
14 | from marker.schema.document import Document
15 | from marker.schema.groups import PageGroup
16 | from marker.services import BaseService
17 | from marker.util import assign_config
18 | from marker.logger import get_logger
19 |
20 | logger = get_logger()
21 |
22 |
class PromptData(TypedDict):
    """Everything needed for a single LLM inference call on one block."""

    prompt: str  # Fully rendered prompt text sent to the LLM.
    image: Image.Image  # Cropped image of the block region.
    block: Block  # The block being rewritten.
    schema: BaseModel  # Pydantic schema the LLM response must conform to.
    page: PageGroup  # Page that contains ``block``.
    additional_data: dict | None  # Optional processor-specific extras.
30 |
31 |
class BlockData(TypedDict):
    """A (page, block) pair queued for LLM processing."""

    page: PageGroup  # Page containing the block.
    block: Block  # Block to be processed.
36 |
class BaseLLMProcessor(BaseProcessor):
    """
    A processor for using LLMs to convert blocks.

    Subclasses choose which blocks to rewrite (via ``block_types``) and how;
    this base provides image cropping, block JSON normalization, and helpers
    for applying LLM responses back onto the document.
    """

    max_concurrency: Annotated[
        int,
        "The maximum number of concurrent requests to make to the Gemini model.",
    ] = 3
    image_expansion_ratio: Annotated[
        float,
        "The ratio to expand the image by when cropping.",
    ] = 0.01
    use_llm: Annotated[
        bool,
        "Whether to use the LLM model.",
    ] = False
    disable_tqdm: Annotated[
        bool,
        "Whether to disable the tqdm progress bar.",
    ] = False
    # Block types this processor operates on; subclasses override.
    block_types = None

    def __init__(self, llm_service: BaseService, config=None):
        super().__init__(config)

        # Only keep the service when LLM usage is enabled; otherwise the
        # processor is a no-op and holds no reference to it.
        self.llm_service = None
        if not self.use_llm:
            return

        self.llm_service = llm_service

    def extract_image(
        self,
        document: Document,
        image_block: Block,
        remove_blocks: Sequence[BlockTypes] | None = None,
    ) -> Image.Image:
        """Crop a high-res image of ``image_block``, expanded slightly on each side."""
        return image_block.get_image(
            document,
            highres=True,
            expansion=(self.image_expansion_ratio, self.image_expansion_ratio),
            remove_blocks=remove_blocks,
        )

    def normalize_block_json(self, block: Block, document: Document, page: PageGroup):
        """
        Get the normalized JSON representation of a block for the LLM.
        """
        page_width = page.polygon.width
        page_height = page.polygon.height
        block_bbox = block.polygon.bbox

        # Normalize bbox to 0-1000 range
        normalized_bbox = [
            (block_bbox[0] / page_width) * 1000,
            (block_bbox[1] / page_height) * 1000,
            (block_bbox[2] / page_width) * 1000,
            (block_bbox[3] / page_height) * 1000,
        ]

        block_json = {
            "id": str(block.id),
            "block_type": str(block.id.block_type),
            "bbox": normalized_bbox,
            "html": json_to_html(block.render(document)),
        }

        return block_json

    def load_blocks(self, response: dict):
        """Parse each JSON-encoded block string in the LLM response."""
        return [json.loads(block) for block in response["blocks"]]

    def handle_rewrites(self, blocks: list, document: Document):
        """Apply rewritten HTML from LLM block data back onto the document.

        Malformed IDs or missing blocks are logged and skipped rather than
        aborting the whole batch.
        """
        for block_data in blocks:
            try:
                block_id = block_data["id"].strip().lstrip("/")
                _, page_id, block_type, block_id = block_id.split("/")
                block_id = BlockId(
                    page_id=page_id,
                    block_id=block_id,
                    block_type=getattr(BlockTypes, block_type),
                )
                block = document.get_block(block_id)
                if not block:
                    logger.debug(f"Block {block_id} not found in document")
                    continue

                if hasattr(block, "html"):
                    block.html = block_data["html"]
            except Exception as e:
                # Use .get() here: if the original failure was a missing "id"
                # key, indexing it again would raise out of this handler.
                logger.debug(f"Error parsing block ID {block_data.get('id')}: {e}")
                continue
130 |
131 |
class BaseLLMComplexBlockProcessor(BaseLLMProcessor):
    """
    A processor for using LLMs to convert blocks with more complex logic.
    """

    def __call__(self, document: Document):
        if not self.use_llm or self.llm_service is None:
            return

        try:
            self.rewrite_blocks(document)
        except Exception as e:
            logger.warning(f"Error rewriting blocks in {self.__class__.__name__}: {e}")

    def process_rewriting(self, document: Document, page: PageGroup, block: Block):
        """Rewrite one block in place. Subclasses must implement this."""
        raise NotImplementedError()

    def rewrite_blocks(self, document: Document):
        """Run ``process_rewriting`` concurrently over all matching blocks."""
        # Don't show progress if there are no blocks to process
        total_blocks = sum(
            len(page.contained_blocks(document, self.block_types))
            for page in document.pages
        )
        if total_blocks == 0:
            return

        pbar = tqdm(
            total=total_blocks,
            desc=f"{self.__class__.__name__} running",
            disable=self.disable_tqdm,
        )
        try:
            with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
                futures = [
                    executor.submit(self.process_rewriting, document, page, block)
                    for page in document.pages
                    for block in page.contained_blocks(document, self.block_types)
                ]
                for future in as_completed(futures):
                    future.result()  # Raise exceptions if any occurred
                    pbar.update(1)
        finally:
            # Close the bar even when a worker raises, so the terminal isn't
            # left with a dangling progress line.
            pbar.close()
175 |
176 |
class BaseLLMSimpleBlockProcessor(BaseLLMProcessor):
    """
    A processor for using LLMs to convert single blocks.
    """

    # Override init since we don't need an llmservice here
    def __init__(self, config=None):
        assign_config(self, config)

    def __call__(self, result: dict, prompt_data: PromptData, document: Document):
        try:
            self.rewrite_block(result, prompt_data, document)
        except Exception as e:
            logger.warning(f"Error rewriting block in {self.__class__.__name__}: {e}")
            traceback.print_exc()

    def inference_blocks(self, document: Document) -> List[BlockData]:
        """Collect every (page, block) pair this processor should run on."""
        return [
            {"page": page, "block": block}
            for page in document.pages
            for block in page.contained_blocks(document, self.block_types)
        ]

    def block_prompts(self, document: Document) -> List[PromptData]:
        """Build the prompts for all target blocks. Subclasses implement this."""
        raise NotImplementedError()

    def rewrite_block(
        self, response: dict, prompt_data: PromptData, document: Document
    ):
        """Apply one LLM response to its block. Subclasses implement this."""
        raise NotImplementedError()
207 |
```
--------------------------------------------------------------------------------
/benchmarks/overall/scorers/llm.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import os
3 | import tempfile
4 | import time
5 | from typing import List
6 |
7 | from PIL import Image
8 | from google.genai.errors import APIError
9 | from google import genai
10 | import pypdfium2 as pdfium
11 |
12 | from benchmarks.overall.scorers import BaseScorer, BlockScores
13 | from marker.settings import settings
14 |
# Raw string is required: the prompt embeds LaTeX such as \frac, and in a
# normal string literal "\f" is interpreted as a form-feed character.
rating_prompt = r"""
You're a document analysis expert who is comparing some markdown to an image to make sure the markdown is correct. You're rating how effectively the provided markdown represents the full text and formatting in the image provided.
You're given an image, along with the extracted markdown:
- Some parts of the page may have been recognized as images and linked from the markdown, like ``.
- Tables will be formatted as Github flavored markdown.
- Block equations will be in LaTeX.
- The image and markdown may be in any language.
- The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text.

The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided.

**Instructions**
Follow this process to evaluate the markdown:
1. Carefully examine the image.
2. Carefully examine the markdown input provided.
3. Compare the image to the markdown representation. Does the markdown representation properly represent the important text and formatting in the image?
4. Assign component scores, as described below.

These are the primary scores:
- Overall - the overall quality of the markdown as compared to the image.
- Text quality - the quality of the text extraction from the image.
- Formatting quality - the quality of the formatting applied to the markdown, as compared to the image.

Depending on which elements are present in the markdown, you will assign element-specific scores.
- Tables - how effectively the tables have been extracted and formatted.
- Forms - how effectively the forms have extracted and formatted.
- Equations - how effectively block equations have been converted to LaTeX.
- Section headers - if all of the section headers have been detected, and the right levels set.
- Lists - if the lists have been properly extracted and formatted.
- Images - if images are identified and placed correctly.

Notes on scoring:
- To get a 5/5, all of the important text from the image must appear in the markdown, and the formatting should be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
- A 3/5 may have small missing text elements from the markdown and/or moderate formatting issues.
- A 1/5 will have major missing text segments from the markdown or completely unreadable formatting.
- Use 0/5 if a field isn't applicable, like if the image doesn't contain a table.

If text that is important to the meaning of the document is missing, do not score higher than 3/5.

Output json, like in the example below.

**Example**
Input
```markdown
# Section 1
This is some *markdown* extracted from a document. Here is a block equation:
$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
```
Output
```json
{
  "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
  "markdown_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
  "comparison": "The text and formatting matches the image. There are no formatting or text extraction issues. The equations and section headers are correct.",
  "overall": 5,
  "text": 5,
  "formatting": 5,
  "section_headers": 5,
  "tables": 0,
  "forms": 0,
  "equations": 5,
  "lists": 0,
  "images": 0
}
```
**Input**
```markdown
{{markdown}}
```
**Output**
"""

# Keys expected in every LLM rating response, split by value type.
comparison_keys = ["comparison"]
description_keys = ["image_description", "markdown_description"]
text_keys = comparison_keys + description_keys
score_keys = ["overall", "text", "formatting", "section_headers", "tables", "forms", "equations",
              "lists", "images"]
92 |
93 |
class LLMScorer(BaseScorer):
    """Score markdown quality by asking Gemini to compare it to the page image."""

    def __call__(self, sample, gt_markdown: List[str], markdown: str) -> BlockScores:
        """Render the first page of the sample PDF and rate ``markdown`` against it."""
        pdf_bytes = sample["pdf"]
        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            f.write(pdf_bytes)
            f.flush()
            f.seek(0)
            doc = pdfium.PdfDocument(f.name)
            img = doc[0].render(scale=96 / 72).to_pil()
            doc.close()

        return self.llm_rater(img, markdown)

    def llm_rater(self, img: Image.Image, markdown: str) -> BlockScores:
        """Ask the LLM for component scores; empty markdown gets a flat 1."""
        if not markdown:
            null_scores = {k: 1 for k in score_keys}
            text_scores = {k: "" for k in text_keys}
            null_scores.update(text_scores)
            return {
                "score": 1,
                "specific_scores": null_scores
            }
        req_keys = text_keys + score_keys
        properties = {}
        for key in req_keys:
            content_type = "INTEGER" if key in score_keys else "STRING"
            properties[key] = {"type": content_type}

        response_schema = {
            "required": req_keys,
            "properties": properties,
            "type": "OBJECT"
        }
        prompt = rating_prompt.replace("{{markdown}}", markdown)
        response = self.llm_response_wrapper([img, prompt], response_schema)
        assert all([k in response for k in req_keys]), f"Missing keys in response: {response}"
        return {
            "score": response["overall"],
            "specific_scores": response,
        }

    def llm_response_wrapper(self, prompt, response_schema, depth=0):
        """Call Gemini with a JSON response schema, retrying on APIError.

        Retries with a 120-second cooldown; after the retry budget is spent
        the last APIError is re-raised immediately (no pointless sleep).
        """
        client = genai.Client(
            http_options={"timeout": 60000},
            vertexai=True,
            project=os.getenv("VERTEX_PROJECT_ID"),
            location=os.getenv("VERTEX_LOCATION"),
        )
        try:
            responses = client.models.generate_content(
                model="gemini-2.0-flash-001",
                contents=prompt,
                config={
                    "temperature": 0,
                    "response_schema": response_schema,
                    "response_mime_type": "application/json",
                },
            )
            output = responses.candidates[0].content.parts[0].text
            return json.loads(output)
        except APIError as e:
            if depth > 2:
                # Retry budget exhausted -- fail now instead of sleeping first.
                raise e
            print("Hit Gemini rate limit, waiting 120 seconds")
            time.sleep(120)
            return self.llm_response_wrapper(prompt, response_schema, depth + 1)
```
--------------------------------------------------------------------------------
/tests/processors/test_llm_processors.py:
--------------------------------------------------------------------------------
```python
1 | from unittest.mock import MagicMock, Mock
2 |
3 | import pytest
4 | from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
5 | from marker.processors.llm.llm_equation import LLMEquationProcessor
6 |
7 | from marker.processors.llm.llm_form import LLMFormProcessor
8 | from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor
9 | from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
10 | from marker.processors.llm.llm_table import LLMTableProcessor
11 | from marker.processors.table import TableProcessor
12 | from marker.renderers.markdown import MarkdownRenderer
13 | from marker.schema import BlockTypes
14 | from marker.schema.blocks import ComplexRegion
15 |
16 |
@pytest.mark.filename("form_1040.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_form_processor_no_config(pdf_document, llm_service):
    # Without use_llm enabled, the meta processor must leave forms untouched.
    meta_processor = LLMSimpleBlockMetaProcessor([LLMFormProcessor()], llm_service)
    meta_processor(pdf_document)

    forms = pdf_document.contained_blocks((BlockTypes.Form,))
    assert forms[0].html is None
26 |
27 |
@pytest.mark.filename("form_1040.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_form_processor_no_cells(pdf_document, llm_service):
    # With use_llm on but no table cells detected, the form is left as-is.
    config = {"use_llm": True, "gemini_api_key": "test"}
    meta_processor = LLMSimpleBlockMetaProcessor(
        [LLMFormProcessor(config)], llm_service, config
    )
    meta_processor(pdf_document)

    forms = pdf_document.contained_blocks((BlockTypes.Form,))
    assert forms[0].html is None
38 |
39 |
@pytest.mark.filename("form_1040.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_form_processor(pdf_document, table_rec_model, recognition_model, detection_model):
    # Canned LLM output: a long corrected-markdown paragraph.
    corrected_html = "<p>" + ("<em>This is corrected markdown.</em>\n" * 100).strip() + "</p>\n"

    mock_service = Mock()
    mock_service.return_value = {"corrected_html": corrected_html}

    # Detect table cells first so the form processor has content to rewrite.
    cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model)
    cell_processor(pdf_document)

    config = {"use_llm": True, "gemini_api_key": "test"}
    meta_processor = LLMSimpleBlockMetaProcessor(
        [LLMFormProcessor(config)], mock_service, config
    )
    meta_processor(pdf_document)

    forms = pdf_document.contained_blocks((BlockTypes.Form,))
    assert forms[0].html == corrected_html.strip()
59 |
60 |
61 |
@pytest.mark.filename("table_ex2.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_table_processor(pdf_document, table_rec_model, recognition_model, detection_model):
    # Canned LLM output the mocked service returns for the table block.
    corrected_html = """
<table>
<tr>
<td>Column 1</td>
<td>Column 2</td>
<td>Column 3</td>
<td>Column 4</td>
</tr>
<tr>
<td>Value 1 <math>x</math></td>
<td>Value 2</td>
<td>Value 3</td>
<td>Value 4</td>
</tr>
<tr>
<td>Value 5</td>
<td>Value 6</td>
<td>Value 7</td>
<td>Value 8</td>
</tr>
</table>
""".strip()

    mock_service = Mock()
    mock_service.return_value = {"corrected_html": corrected_html}

    # Run cell detection first, then the LLM table rewrite.
    cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model)
    cell_processor(pdf_document)

    llm_processor = LLMTableProcessor(mock_service, {"use_llm": True, "gemini_api_key": "test"})
    llm_processor(pdf_document)

    tables = pdf_document.contained_blocks((BlockTypes.Table,))
    table_cells = tables[0].contained_blocks(pdf_document, (BlockTypes.TableCell,))
    assert table_cells[0].text == "Column 1"

    # Inline <math> cells should render as $...$ in the markdown output.
    markdown = MarkdownRenderer()(pdf_document).markdown
    assert "Value 1 $x$" in markdown
103 |
104 |
@pytest.mark.filename("A17_FlightPlan.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_caption_processor_disabled(pdf_document):
    # With image extraction left at its default, no descriptions are generated.
    config = {"use_llm": True, "gemini_api_key": "test"}
    meta_processor = LLMSimpleBlockMetaProcessor(
        [LLMImageDescriptionProcessor(config)], MagicMock(), config
    )
    meta_processor(pdf_document)

    pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure))
    assert all(picture.description is None for picture in pictures)
116 |
@pytest.mark.filename("A17_FlightPlan.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_caption_processor(pdf_document):
    description = "This is an image description."
    mock_service = Mock()
    mock_service.return_value = {"image_description": description}

    # extract_images=False makes the processor describe pictures instead.
    config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False}
    meta_processor = LLMSimpleBlockMetaProcessor(
        [LLMImageDescriptionProcessor(config)], mock_service, config
    )
    meta_processor(pdf_document)

    pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure))
    assert all(picture.description == description for picture in pictures)

    # Ensure the rendering includes the description
    md = MarkdownRenderer({"extract_images": False})(pdf_document).markdown
    assert description in md
137 |
138 |
@pytest.mark.filename("A17_FlightPlan.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_complex_region_processor(pdf_document):
    md = "This is some *markdown* for a complex region."
    mock_service = Mock()
    mock_service.return_value = {"corrected_markdown": md * 25}

    # Replace the block with a complex region
    old_block = pdf_document.pages[0].children[0]
    new_block = ComplexRegion(
        **old_block.dict(exclude=["id", "block_id", "block_type"]),
    )
    pdf_document.pages[0].replace_block(old_block, new_block)

    # Run the complex-region processor with the mocked service.
    config = {"use_llm": True, "gemini_api_key": "test"}
    meta_processor = LLMSimpleBlockMetaProcessor(
        [LLMComplexRegionProcessor(config)], mock_service, config
    )
    meta_processor(pdf_document)

    # The corrected markdown should survive into the rendered output.
    rendered_md = MarkdownRenderer()(pdf_document).markdown
    assert md in rendered_md
164 |
@pytest.mark.filename("adversarial.pdf")
@pytest.mark.config({"page_range": [0]})
def test_multi_llm_processors(pdf_document):
    """Two simple-block processors sharing one meta processor both take effect."""
    description = "<math>This is an image description. And here is a lot of writing about it.</math>" * 10
    mock_cls = Mock()
    mock_cls.return_value = {"image_description": description, "corrected_equation": description}

    config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001}
    processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)]
    processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
    processor(pdf_document)

    contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure))
    assert all(picture.description == description for picture in contained_pictures)

    contained_equations = pdf_document.contained_blocks((BlockTypes.Equation,))
    # Removed leftover debug print of equation HTML -- it only cluttered test output.
    assert all(equation.html == description for equation in contained_equations)
```
--------------------------------------------------------------------------------
/benchmarks/table/inference.py:
--------------------------------------------------------------------------------
```python
1 | from typing import List
2 |
3 | import numpy as np
4 | from bs4 import BeautifulSoup
5 | import pypdfium2 as pdfium
6 | from tqdm import tqdm
7 | import base64
8 | import tempfile
9 |
10 | from benchmarks.table.gemini import gemini_table_rec
11 | from marker.config.parser import ConfigParser
12 | from marker.converters.table import TableConverter
13 | from marker.models import create_model_dict
14 | from marker.processors.llm.llm_table import LLMTableProcessor
15 | from marker.processors.table import TableProcessor
16 | from marker.renderers.json import JSONBlockOutput
17 | from marker.schema.polygon import PolygonBox
18 | from marker.util import matrix_intersection_area
19 |
20 |
def extract_tables(children: List[JSONBlockOutput]):
    """Depth-first collection of all 'Table' blocks in a block tree.

    Table nodes themselves are not descended into; non-table nodes are
    searched recursively when they have children.
    """
    found = []
    for child in children:
        if child.block_type == 'Table':
            found.append(child)
        elif child.children:
            found.extend(extract_tables(child.children))
    return found
29 |
def fix_table_html(table_html: str) -> str:
    """Normalize table HTML so marker output compares fairly against Fintabnet.

    Unwraps <tbody>, downgrades <th> cells to <td>, strips <br> tags, and
    replaces newlines with spaces (Fintabnet uses spaces instead of newlines).
    """
    soup = BeautifulSoup(table_html, 'html.parser')

    body = soup.find('tbody')
    if body:
        body.unwrap()
    for header_cell in soup.find_all('th'):
        header_cell.name = 'td'
    for line_break in soup.find_all('br'):
        line_break.replace_with(soup.new_string(''))

    return str(soup).replace("\n", " ")
43 |
44 |
def inference_tables(dataset, use_llm: bool, table_rec_batch_size: int | None, max_rows: int, use_gemini: bool):
    """Run marker's table pipeline over a benchmark dataset and align to GT.

    For each dataset row (a base64 PDF plus ground-truth tables), converts the
    document with TableConverter, renders the first page, matches detected
    tables to ground-truth tables by bbox intersection area, optionally asks
    Gemini for its own reconstruction of each aligned table image, and
    collects normalized HTML for scoring.

    Args:
        dataset: Indexable rows with base64 "pdf" and "tables" entries
            (GT tables carry "html" and "normalized_bbox").
        use_llm: Enable marker's LLM table processor.
        table_rec_batch_size: Table recognition batch size (None = default).
        max_rows: Cap on how many dataset rows to process (None = all).
        use_gemini: Also collect a Gemini table reconstruction per table.

    Returns:
        Tuple ``(results, total_unaligned)``: result dicts with
        "marker_table", "gt_table", "gemini_table" HTML, plus the count of GT
        tables that could not be matched to a marker table.
    """
    models = create_model_dict()
    config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True})
    total_unaligned = 0
    results = []

    iterations = len(dataset)
    if max_rows is not None:
        iterations = min(max_rows, len(dataset))

    for i in tqdm(range(iterations), desc='Converting Tables'):
        try:
            row = dataset[i]
            pdf_binary = base64.b64decode(row['pdf'])
            gt_tables = row['tables']  # Already sorted by reading order, which is what marker returns

            # Only use the basic table processors
            converter = TableConverter(
                config=config_parser.generate_config_dict(),
                artifact_dict=models,
                processor_list=[
                    "marker.processors.table.TableProcessor",
                    "marker.processors.llm.llm_table.LLMTableProcessor",
                ],
                renderer=config_parser.get_renderer()
            )

            with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
                temp_pdf_file.write(pdf_binary)
                temp_pdf_file.seek(0)
                marker_json = converter(temp_pdf_file.name).children

                # Render the first page so aligned tables can be cropped out.
                doc = pdfium.PdfDocument(temp_pdf_file.name)
                page_image = doc[0].render(scale=96/72).to_pil()
                doc.close()

            if len(marker_json) == 0 or len(gt_tables) == 0:
                print(f'No tables detected, skipping...')
                total_unaligned += len(gt_tables)
                continue

            marker_tables = extract_tables(marker_json)
            marker_table_boxes = [table.bbox for table in marker_tables]
            page_bbox = marker_json[0].bbox

            if len(marker_tables) != len(gt_tables):
                print(f'Number of tables do not match, skipping...')
                total_unaligned += len(gt_tables)
                continue

            # Crop each detected table from the page image (page coords are
            # rescaled to image pixel coords first).
            table_images = [
                page_image.crop(
                    PolygonBox.from_bbox(bbox)
                    .rescale(
                        (page_bbox[2], page_bbox[3]), (page_image.width, page_image.height)
                    ).bbox
                )
                for bbox
                in marker_table_boxes
            ]

            # Normalize the bboxes
            for bbox in marker_table_boxes:
                bbox[0] = bbox[0] / page_bbox[2]
                bbox[1] = bbox[1] / page_bbox[3]
                bbox[2] = bbox[2] / page_bbox[2]
                bbox[3] = bbox[3] / page_bbox[3]

            gt_boxes = [table['normalized_bbox'] for table in gt_tables]
            gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes]
            marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes]
            table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes)

            # Greedy 1:1 matching: each GT table takes the marker table with
            # the largest intersection, subject to area-ratio sanity checks.
            aligned_tables = []
            used_tables = set()
            unaligned_tables = set()
            for table_idx, alignment in enumerate(table_alignments):
                try:
                    max_area = np.max(alignment)
                    aligned_idx = np.argmax(alignment)
                except ValueError:
                    # No alignment found
                    unaligned_tables.add(table_idx)
                    continue

                if max_area <= .01:
                    # No alignment found
                    unaligned_tables.add(table_idx)
                    continue

                if aligned_idx in used_tables:
                    # Marker table already aligned with another gt table
                    unaligned_tables.add(table_idx)
                    continue

                # Gt table doesn't align well with any marker table
                gt_table_pct = gt_areas[table_idx] / max_area
                if not .85 < gt_table_pct < 1.15:
                    unaligned_tables.add(table_idx)
                    continue

                # Marker table doesn't align with gt table
                marker_table_pct = marker_areas[aligned_idx] / max_area
                if not .85 < marker_table_pct < 1.15:
                    unaligned_tables.add(table_idx)
                    continue

                gemini_html = ""
                if use_gemini:
                    try:
                        gemini_html = gemini_table_rec(table_images[aligned_idx])
                    except Exception as e:
                        print(f'Gemini failed: {e}')

                aligned_tables.append(
                    (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html)
                )
                used_tables.add(aligned_idx)

            total_unaligned += len(unaligned_tables)

            for marker_table, gt_table, gemini_table in aligned_tables:
                gt_table_html = gt_table['html']

                # marker wraps the table in <tbody> which fintabnet data doesn't
                # Fintabnet doesn't use th tags, need to be replaced for fair comparison
                marker_table_html = fix_table_html(marker_table.html)
                gemini_table_html = fix_table_html(gemini_table)

                results.append({
                    "marker_table": marker_table_html,
                    "gt_table": gt_table_html,
                    "gemini_table": gemini_table_html
                })
        except pdfium.PdfiumError:
            print('Broken PDF, Skipping...')
            continue
    return results, total_unaligned
```