This is page 3 of 9. Use http://codebase.md/datalab-to/marker?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .github
│ ├── ISSUE_TEMPLATE
│ │ ├── breaking-bug-report.md
│ │ ├── feature_request.md
│ │ └── output-bug-report.md
│ └── workflows
│ ├── benchmarks.yml
│ ├── ci.yml
│ ├── cla.yml
│ ├── publish.yml
│ └── scripts.yml
├── .gitignore
├── .pre-commit-config.yaml
├── benchmarks
│ ├── __init__.py
│ ├── overall
│ │ ├── __init__.py
│ │ ├── display
│ │ │ ├── __init__.py
│ │ │ ├── dataset.py
│ │ │ └── table.py
│ │ ├── download
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── llamaparse.py
│ │ │ ├── main.py
│ │ │ ├── mathpix.py
│ │ │ └── mistral.py
│ │ ├── elo.py
│ │ ├── methods
│ │ │ ├── __init__.py
│ │ │ ├── docling.py
│ │ │ ├── gt.py
│ │ │ ├── llamaparse.py
│ │ │ ├── marker.py
│ │ │ ├── mathpix.py
│ │ │ ├── mistral.py
│ │ │ ├── olmocr.py
│ │ │ └── schema.py
│ │ ├── overall.py
│ │ ├── registry.py
│ │ ├── schema.py
│ │ └── scorers
│ │ ├── __init__.py
│ │ ├── clean.py
│ │ ├── heuristic.py
│ │ ├── llm.py
│ │ └── schema.py
│ ├── table
│ │ ├── __init__.py
│ │ ├── gemini.py
│ │ ├── inference.py
│ │ ├── scoring.py
│ │ └── table.py
│ ├── throughput
│ │ ├── __init__.py
│ │ └── main.py
│ └── verify_scores.py
├── chunk_convert.py
├── CLA.md
├── convert_single.py
├── convert.py
├── data
│ ├── .gitignore
│ ├── examples
│ │ ├── json
│ │ │ ├── multicolcnn.json
│ │ │ ├── switch_trans.json
│ │ │ └── thinkpython.json
│ │ └── markdown
│ │ ├── multicolcnn
│ │ │ ├── _page_1_Figure_0.jpeg
│ │ │ ├── _page_2_Picture_0.jpeg
│ │ │ ├── _page_6_Figure_0.jpeg
│ │ │ ├── _page_7_Figure_0.jpeg
│ │ │ ├── multicolcnn_meta.json
│ │ │ └── multicolcnn.md
│ │ ├── switch_transformers
│ │ │ ├── _page_11_Figure_4.jpeg
│ │ │ ├── _page_12_Figure_4.jpeg
│ │ │ ├── _page_13_Figure_2.jpeg
│ │ │ ├── _page_18_Figure_1.jpeg
│ │ │ ├── _page_18_Figure_3.jpeg
│ │ │ ├── _page_2_Figure_3.jpeg
│ │ │ ├── _page_20_Figure_1.jpeg
│ │ │ ├── _page_20_Figure_4.jpeg
│ │ │ ├── _page_27_Figure_1.jpeg
│ │ │ ├── _page_29_Figure_1.jpeg
│ │ │ ├── _page_30_Figure_1.jpeg
│ │ │ ├── _page_31_Figure_3.jpeg
│ │ │ ├── _page_4_Figure_1.jpeg
│ │ │ ├── _page_5_Figure_3.jpeg
│ │ │ ├── switch_trans_meta.json
│ │ │ └── switch_trans.md
│ │ └── thinkpython
│ │ ├── _page_109_Figure_1.jpeg
│ │ ├── _page_115_Figure_1.jpeg
│ │ ├── _page_116_Figure_3.jpeg
│ │ ├── _page_127_Figure_1.jpeg
│ │ ├── _page_128_Figure_1.jpeg
│ │ ├── _page_167_Figure_1.jpeg
│ │ ├── _page_169_Figure_1.jpeg
│ │ ├── _page_173_Figure_1.jpeg
│ │ ├── _page_190_Figure_1.jpeg
│ │ ├── _page_195_Figure_1.jpeg
│ │ ├── _page_205_Figure_1.jpeg
│ │ ├── _page_23_Figure_1.jpeg
│ │ ├── _page_23_Figure_3.jpeg
│ │ ├── _page_230_Figure_1.jpeg
│ │ ├── _page_233_Figure_1.jpeg
│ │ ├── _page_233_Figure_3.jpeg
│ │ ├── _page_234_Figure_1.jpeg
│ │ ├── _page_235_Figure_1.jpeg
│ │ ├── _page_236_Figure_1.jpeg
│ │ ├── _page_236_Figure_3.jpeg
│ │ ├── _page_237_Figure_1.jpeg
│ │ ├── _page_238_Figure_1.jpeg
│ │ ├── _page_46_Figure_1.jpeg
│ │ ├── _page_60_Figure_1.jpeg
│ │ ├── _page_60_Figure_3.jpeg
│ │ ├── _page_67_Figure_1.jpeg
│ │ ├── _page_71_Figure_1.jpeg
│ │ ├── _page_78_Figure_1.jpeg
│ │ ├── _page_85_Figure_1.jpeg
│ │ ├── _page_94_Figure_1.jpeg
│ │ ├── _page_99_Figure_17.jpeg
│ │ ├── _page_99_Figure_178.jpeg
│ │ ├── thinkpython_meta.json
│ │ └── thinkpython.md
│ ├── images
│ │ ├── overall.png
│ │ ├── per_doc.png
│ │ └── table.png
│ └── latex_to_md.sh
├── examples
│ ├── marker_modal_deployment.py
│ └── README.md
├── extraction_app.py
├── LICENSE
├── marker
│ ├── builders
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── layout.py
│ │ ├── line.py
│ │ ├── ocr.py
│ │ └── structure.py
│ ├── config
│ │ ├── __init__.py
│ │ ├── crawler.py
│ │ ├── parser.py
│ │ └── printer.py
│ ├── converters
│ │ ├── __init__.py
│ │ ├── extraction.py
│ │ ├── ocr.py
│ │ ├── pdf.py
│ │ └── table.py
│ ├── extractors
│ │ ├── __init__.py
│ │ ├── document.py
│ │ └── page.py
│ ├── logger.py
│ ├── models.py
│ ├── output.py
│ ├── processors
│ │ ├── __init__.py
│ │ ├── blank_page.py
│ │ ├── block_relabel.py
│ │ ├── blockquote.py
│ │ ├── code.py
│ │ ├── debug.py
│ │ ├── document_toc.py
│ │ ├── equation.py
│ │ ├── footnote.py
│ │ ├── ignoretext.py
│ │ ├── line_merge.py
│ │ ├── line_numbers.py
│ │ ├── list.py
│ │ ├── llm
│ │ │ ├── __init__.py
│ │ │ ├── llm_complex.py
│ │ │ ├── llm_equation.py
│ │ │ ├── llm_form.py
│ │ │ ├── llm_handwriting.py
│ │ │ ├── llm_image_description.py
│ │ │ ├── llm_mathblock.py
│ │ │ ├── llm_meta.py
│ │ │ ├── llm_page_correction.py
│ │ │ ├── llm_sectionheader.py
│ │ │ ├── llm_table_merge.py
│ │ │ └── llm_table.py
│ │ ├── order.py
│ │ ├── page_header.py
│ │ ├── reference.py
│ │ ├── sectionheader.py
│ │ ├── table.py
│ │ ├── text.py
│ │ └── util.py
│ ├── providers
│ │ ├── __init__.py
│ │ ├── document.py
│ │ ├── epub.py
│ │ ├── html.py
│ │ ├── image.py
│ │ ├── pdf.py
│ │ ├── powerpoint.py
│ │ ├── registry.py
│ │ ├── spreadsheet.py
│ │ └── utils.py
│ ├── renderers
│ │ ├── __init__.py
│ │ ├── chunk.py
│ │ ├── extraction.py
│ │ ├── html.py
│ │ ├── json.py
│ │ ├── markdown.py
│ │ └── ocr_json.py
│ ├── schema
│ │ ├── __init__.py
│ │ ├── blocks
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── basetable.py
│ │ │ ├── caption.py
│ │ │ ├── code.py
│ │ │ ├── complexregion.py
│ │ │ ├── equation.py
│ │ │ ├── figure.py
│ │ │ ├── footnote.py
│ │ │ ├── form.py
│ │ │ ├── handwriting.py
│ │ │ ├── inlinemath.py
│ │ │ ├── listitem.py
│ │ │ ├── pagefooter.py
│ │ │ ├── pageheader.py
│ │ │ ├── picture.py
│ │ │ ├── reference.py
│ │ │ ├── sectionheader.py
│ │ │ ├── table.py
│ │ │ ├── tablecell.py
│ │ │ ├── text.py
│ │ │ └── toc.py
│ │ ├── document.py
│ │ ├── groups
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── figure.py
│ │ │ ├── list.py
│ │ │ ├── page.py
│ │ │ ├── picture.py
│ │ │ └── table.py
│ │ ├── polygon.py
│ │ ├── registry.py
│ │ └── text
│ │ ├── __init__.py
│ │ ├── char.py
│ │ ├── line.py
│ │ └── span.py
│ ├── scripts
│ │ ├── __init__.py
│ │ ├── chunk_convert.py
│ │ ├── chunk_convert.sh
│ │ ├── common.py
│ │ ├── convert_single.py
│ │ ├── convert.py
│ │ ├── extraction_app.py
│ │ ├── file_to_s3.py
│ │ ├── run_streamlit_app.py
│ │ ├── server.py
│ │ └── streamlit_app.py
│ ├── services
│ │ ├── __init__.py
│ │ ├── azure_openai.py
│ │ ├── claude.py
│ │ ├── gemini.py
│ │ ├── ollama.py
│ │ ├── openai.py
│ │ └── vertex.py
│ ├── settings.py
│ ├── util.py
│ └── utils
│ ├── __init__.py
│ ├── batch.py
│ ├── gpu.py
│ └── image.py
├── marker_app.py
├── marker_server.py
├── MODEL_LICENSE
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── README.md
├── signatures
│ └── version1
│ └── cla.json
├── static
│ └── fonts
│ └── .gitignore
└── tests
├── builders
│ ├── test_blank_page.py
│ ├── test_document_builder.py
│ ├── test_garbled_pdf.py
│ ├── test_layout_replace.py
│ ├── test_ocr_builder.py
│ ├── test_ocr_pipeline.py
│ ├── test_overriding.py
│ ├── test_pdf_links.py
│ ├── test_rotated_bboxes.py
│ ├── test_strip_existing_ocr.py
│ └── test_structure.py
├── config
│ └── test_config.py
├── conftest.py
├── converters
│ ├── test_extraction_converter.py
│ ├── test_ocr_converter.py
│ ├── test_pdf_converter.py
│ └── test_table_converter.py
├── processors
│ ├── test_document_toc_processor.py
│ ├── test_equation_processor.py
│ ├── test_footnote_processor.py
│ ├── test_ignoretext.py
│ ├── test_llm_processors.py
│ ├── test_table_merge.py
│ └── test_table_processor.py
├── providers
│ ├── test_document_providers.py
│ ├── test_image_provider.py
│ └── test_pdf_provider.py
├── renderers
│ ├── test_chunk_renderer.py
│ ├── test_extract_images.py
│ ├── test_html_renderer.py
│ ├── test_json_renderer.py
│ └── test_markdown_renderer.py
├── schema
│ └── groups
│ └── test_list_grouping.py
├── services
│ └── test_service_init.py
└── utils.py
```
# Files
--------------------------------------------------------------------------------
/marker/services/claude.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import time
3 | from typing import List, Annotated, T
4 |
5 | import PIL
6 | from PIL import Image
7 | import anthropic
8 | from anthropic import RateLimitError, APITimeoutError
9 | from marker.logger import get_logger
10 | from pydantic import BaseModel
11 |
12 | from marker.schema.blocks import Block
13 | from marker.services import BaseService
14 |
15 | logger = get_logger()
16 |
17 |
class ClaudeService(BaseService):
    """LLM service implementation backed by Anthropic's Claude messages API."""

    claude_model_name: Annotated[
        str, "The name of the Claude model to use for the service."
    ] = "claude-3-7-sonnet-20250219"
    claude_api_key: Annotated[str, "The Claude API key to use for the service."] = None
    max_claude_tokens: Annotated[
        int, "The maximum number of tokens to use for a single Claude request."
    ] = 8192

    def process_images(self, images: List[Image.Image]) -> List[dict]:
        """Convert PIL images into Claude API image content blocks (base64 webp)."""
        return [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/webp",
                    "data": self.img_to_base64(img),
                },
            }
            for img in images
        ]

    def validate_response(self, response_text: str, schema: type[BaseModel]) -> dict | None:
        """Parse Claude's text response against ``schema``.

        Strips markdown code fences if present, then validates the JSON.
        Returns the validated payload as a plain dict, or ``None`` when the
        response cannot be parsed even after escape fixing.
        (Fixed: was annotated ``-> T`` although it returns a dict, and used an
        implicit bare ``return`` on failure.)
        """
        response_text = response_text.strip()
        # The model sometimes wraps output in a markdown fence despite instructions.
        if response_text.startswith("```json"):
            response_text = response_text[7:]
        if response_text.endswith("```"):
            response_text = response_text[:-3]

        try:
            # Try to parse as JSON first
            return schema.model_validate_json(response_text).model_dump()
        except Exception:
            try:
                # Re-parse with fixed escapes: single backslashes (invalid JSON
                # escapes) are doubled before re-validating.
                escaped_str = response_text.replace("\\", "\\\\")
                return schema.model_validate_json(escaped_str).model_dump()
            except Exception:
                return None

    def get_client(self):
        """Build an Anthropic client authenticated with the configured API key."""
        return anthropic.Anthropic(
            api_key=self.claude_api_key,
        )

    def __call__(
        self,
        prompt: str,
        image: PIL.Image.Image | List[PIL.Image.Image] | None,
        block: Block | None,
        response_schema: type[BaseModel],
        max_retries: int | None = None,
        timeout: int | None = None,
    ):
        """Send ``prompt`` (plus optional images) to Claude and return the
        schema-validated response as a dict.

        Retries with linear backoff on rate-limit/timeout errors; returns an
        empty dict when all attempts fail or an unexpected error occurs.
        ``block`` is accepted for interface compatibility but not used here.
        """
        if max_retries is None:
            max_retries = self.max_retries

        if timeout is None:
            timeout = self.timeout

        schema_example = response_schema.model_json_schema()
        system_prompt = f"""
Follow the instructions given by the user prompt. You must provide your response in JSON format matching this schema:

{json.dumps(schema_example, indent=2)}

Respond only with the JSON schema, nothing else. Do not include ```json, ```, or any other formatting.
""".strip()

        client = self.get_client()
        image_data = self.format_image_for_llm(image)

        messages = [
            {
                "role": "user",
                "content": [
                    *image_data,
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        total_tries = max_retries + 1
        for tries in range(1, total_tries + 1):
            try:
                response = client.messages.create(
                    system=system_prompt,
                    model=self.claude_model_name,
                    max_tokens=self.max_claude_tokens,
                    messages=messages,
                    timeout=timeout,
                )
                # Extract and validate response
                response_text = response.content[0].text
                return self.validate_response(response_text, response_schema)
            except (RateLimitError, APITimeoutError) as e:
                # Rate limit exceeded
                if tries == total_tries:
                    # Last attempt failed. Give up
                    logger.error(
                        f"Rate limit error: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
                    )
                    break
                else:
                    # Linear backoff: wait longer after each failed attempt.
                    wait_time = tries * self.retry_wait_time
                    logger.warning(
                        f"Rate limit error: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})",
                    )
                    time.sleep(wait_time)
            except Exception as e:
                logger.error(f"Error during Claude API call: {e}")
                break

        return {}
135 |
```
--------------------------------------------------------------------------------
/CLA.md:
--------------------------------------------------------------------------------
```markdown
1 | Marker Contributor Agreement
2 |
3 | This Marker Contributor Agreement ("MCA") applies to any contribution that you make to any product or project managed by us (the "project"), and sets out the intellectual property rights you grant to us in the contributed materials. The term "us" shall mean Endless Labs, Inc. The term "you" shall mean the person or entity identified below.
4 |
5 | If you agree to be bound by these terms, sign by writing "I have read the CLA document and I hereby sign the CLA" in response to the CLA bot Github comment. Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.
6 |
7 | 1. The term 'contribution' or 'contributed materials' means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project.
8 | 2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution:
9 | - you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty free, unrestricted license to exercise all rights under those copyrights. This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements, including dual-license structures for commercial customers;
10 | - you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work;
11 | - you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees;
12 | - you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and
13 | - you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution.
14 | 3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to:
15 | - make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and
16 | - at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements.
17 | If you or your affiliates institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the contribution or any project it was submitted to constitutes direct or contributory patent infringement, then any patent licenses granted to you under this agreement for that contribution shall terminate as of the date such litigation is filed.
18 | 4. Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms. Any contribution we make available under any license will also be made available under a suitable FSF (Free Software Foundation) or OSI (Open Source Initiative) approved license.
19 | 5. You covenant, represent, warrant and agree that:
20 | - each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this MCA;
21 | - to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
22 | - each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws.
23 | You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. Endless Labs, Inc. may publicly disclose your participation in the project, including the fact that you have signed the MCA.
24 | 6. This MCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.
```
--------------------------------------------------------------------------------
/marker/schema/text/line.py:
--------------------------------------------------------------------------------
```python
1 | import html
2 | import re
3 | from typing import Literal, List
4 |
5 | import regex
6 |
7 | from marker.schema import BlockTypes
8 | from marker.schema.blocks import Block, BlockOutput
9 |
# Character class of trailing-hyphen marks used when de-hyphenating wrapped
# lines (see strip_trailing_hyphens): hyphen, em-dash, and "¬" — presumably
# emitted by some extraction/OCR output as a hyphenation mark; TODO confirm.
HYPHENS = r"-—¬"
11 |
12 |
# Matches a single HTML/XML tag such as "<b>" or "</a>".
_TAG_PATTERN = re.compile(r"<[^>]+>")


def remove_tags(text):
    """Return *text* with every HTML/XML tag removed, keeping only text content."""
    return _TAG_PATTERN.sub("", text)
15 |
16 |
def replace_last(string, old, new):
    """Replace the final regex match of *old* in *string* with *new*.

    *old* is treated as a regular expression. If it never matches, *string*
    is returned unchanged.
    """
    last = None
    for candidate in re.finditer(old, string):
        last = candidate
    if last is None:
        return string
    head, tail = string[: last.start()], string[last.end() :]
    return head + new + tail
23 |
24 |
def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str:
    """De-hyphenate a wrapped line.

    If *line_text* ends in a hyphen mark and *next_line_text* begins with a
    lowercase letter (the usual signature of a word broken across lines), the
    final hyphen is removed from *line_html*; otherwise *line_html* is
    returned unchanged.
    """
    lowercase_letters = r"\p{Ll}"

    ends_with_hyphen = regex.compile(rf".*[{HYPHENS}]\s?$", regex.DOTALL).match(
        line_text
    )
    continues_lowercase = regex.match(
        rf"^\s?[{lowercase_letters}]", next_line_text
    )

    if ends_with_hyphen and continues_lowercase:
        return replace_last(line_html, rf"[{HYPHENS}]", "")
    return line_html
37 |
38 |
class Line(Block):
    """A single line of text, assembled from its child spans."""

    block_type: BlockTypes = BlockTypes.Line
    block_description: str = "A line of text."
    # Sometimes we want to set math format at the line level, not span
    formats: List[Literal["math"]] | None = None

    def ocr_input_text(self, document):
        """Text used as OCR input: span text with only <i>/<b> markers.

        Superscripts/subscripts and math are deliberately not tagged here
        since they can be unreliable at this stage.
        """
        pieces = []
        for span in self.contained_blocks(document, (BlockTypes.Span,)):
            span_text = span.text
            if span.italic:
                pieces.append(f"<i>{span_text}</i>")
            elif span.bold:
                pieces.append(f"<b>{span_text}</b>")
            else:
                pieces.append(span_text)
        return "".join(pieces).strip()

    def formatted_text(self, document, skip_urls=False):
        """HTML-escaped line text with superscript, link, and style tags applied."""
        pieces = []
        for span in self.contained_blocks(document, (BlockTypes.Span,)):
            span_text = html.escape(span.text)

            if span.has_superscript:
                # Wrap the leading digits/symbols in <sup>; if the pattern did
                # not produce one, superscript the whole span instead.
                span_text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", span_text)
                if "<sup>" not in span_text:
                    span_text = f"<sup>{span_text}</sup>"

            if span.url and not skip_urls:
                span_text = f"<a href='{span.url}'>{span_text}</a>"

            if span.italic:
                span_text = f"<i>{span_text}</i>"
            elif span.bold:
                span_text = f"<b>{span_text}</b>"
            elif span.math:
                span_text = f"<math display='inline'>{span_text}</math>"
            pieces.append(span_text)
        return "".join(pieces)

    def assemble_html(self, document, child_blocks, parent_structure, block_config):
        """Join child HTML; de-hyphenate against the following line, or strip
        trailing spaces when this is the last line in the parent."""
        template = "".join(child.html for child in child_blocks)

        raw_text = remove_tags(template).strip()
        own_idx = parent_structure.index(self.id)
        if own_idx >= len(parent_structure) - 1:
            # strip any trailing whitespace from the last line
            return template.strip(" ")

        following_line = document.get_block(parent_structure[own_idx + 1])
        return strip_trailing_hyphens(
            raw_text, following_line.raw_text(document), template
        )

    def render(
        self, document, parent_structure, section_hierarchy=None, block_config=None
    ):
        """Render this line and its children into a BlockOutput."""
        child_content = [
            document.get_block(child_id).render(
                document, parent_structure, section_hierarchy, block_config
            )
            for child_id in (self.structure or [])
        ]

        return BlockOutput(
            html=self.assemble_html(
                document, child_content, parent_structure, block_config
            ),
            polygon=self.polygon,
            id=self.id,
            children=[],
            section_hierarchy=section_hierarchy,
        )

    def merge(self, other: "Line"):
        """Absorb *other* into this line: union the polygons, concatenate the
        structures, and union the formats (both fields tolerate None)."""
        self.polygon = self.polygon.merge([other.polygon])

        # Handle merging structure with Nones
        if self.structure is None:
            self.structure = other.structure
        elif other.structure is not None:
            self.structure = self.structure + other.structure

        # Merge formats with Nones
        if self.formats is None:
            self.formats = other.formats
        elif other.formats is not None:
            self.formats = list(set(self.formats + other.formats))
139 |
```
--------------------------------------------------------------------------------
/marker/processors/list.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated, List, Tuple
2 |
3 | from marker.processors import BaseProcessor
4 | from marker.schema import BlockTypes
5 | from marker.schema.blocks import ListItem
6 | from marker.schema.document import Document
7 |
8 |
class ListProcessor(BaseProcessor):
    """
    A processor for merging lists across pages and columns.

    Runs two passes over each page's ListGroup blocks: one to flag groups
    that continue into the next column/page, and one to rebuild list
    nesting from the horizontal indentation of the list items.
    """
    block_types = (BlockTypes.ListGroup,)
    ignored_block_types: Annotated[
        Tuple[BlockTypes],
        "The list of block types to ignore when merging lists.",
    ] = (BlockTypes.PageHeader, BlockTypes.PageFooter)
    min_x_indent: Annotated[
        float, "The minimum horizontal indentation required to consider a block as a nested list item.",
        "This is expressed as a percentage of the page width and is used to determine hierarchical relationships within a list.",
    ] = 0.01

    def __init__(self, config):
        # No extra state: delegates straight to BaseProcessor.
        super().__init__(config)

    def __call__(self, document: Document):
        # Continuation flags first, then indentation-based nesting.
        self.list_group_continuation(document)
        self.list_group_indentation(document)

    def list_group_continuation(self, document: Document):
        """Set ``has_continuation`` on each list group whose following list
        group looks like a continuation across a column or page break."""
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                next_block = document.get_next_block(block, self.ignored_block_types)
                if next_block is None:  # nothing follows this group
                    continue
                if next_block.block_type not in self.block_types:  # not a list group
                    continue
                if next_block.structure is None:  # empty list group
                    continue
                if next_block.ignore_for_output:
                    continue

                column_break, page_break = False, False
                next_block_in_first_quadrant = False

                if next_block.page_id == block.page_id:  # block on the same page
                    # we check for a column break: the next group starts at or
                    # above this group's bottom edge, so it must sit in a new column
                    column_break = next_block.polygon.y_start <= block.polygon.y_end
                else:
                    page_break = True
                    next_page = document.get_page(next_block.page_id)
                    # A continuation should resume in the top-left quadrant of
                    # the next page.
                    next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \
                        (next_block.polygon.y_start < next_page.polygon.height // 2)

                block.has_continuation = column_break or (page_break and next_block_in_first_quadrant)

    def list_group_indentation(self, document: Document):
        """Infer ``list_indent_level`` for each list item from its x-offset,
        then re-parent items under the nearest shallower-indented item.

        Pass 1 walks the items keeping a stack of candidate ancestors by
        x-position; pass 2 uses the computed levels to nest items.
        """
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                if block.structure is None:
                    continue
                if block.ignore_for_output:
                    continue

                # NOTE(review): the stack is seeded with the block following
                # `block` — presumably the first list item; confirm
                # get_next_block's semantics when passed None.
                stack: List[ListItem] = [block.get_next_block(page, None)]
                for list_item_id in block.structure:
                    list_item_block: ListItem = page.get_block(list_item_id)

                    # This can be a line sometimes
                    if list_item_block.block_type != BlockTypes.ListItem:
                        continue

                    # Drop ancestors that are not left of this item by at least
                    # min_x_indent of the page width.
                    while stack and list_item_block.polygon.x_start <= stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width):
                        stack.pop()

                    if stack and list_item_block.polygon.y_start > stack[-1].polygon.y_start:
                        # Inherit the level of the nearest ancestor; indent one
                        # more if this item starts further right than it.
                        list_item_block.list_indent_level = stack[-1].list_indent_level
                        if list_item_block.polygon.x_start > stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width):
                            list_item_block.list_indent_level += 1

                    next_list_item_block = block.get_next_block(page, list_item_block)
                    if next_list_item_block is not None and next_list_item_block.polygon.x_start > list_item_block.polygon.x_end:
                        stack = [next_list_item_block]  # reset stack on column breaks
                    else:
                        stack.append(list_item_block)

                # Pass 2: move each item under the closest preceding item with a
                # strictly smaller indent level, merging polygons as we go.
                stack: List[ListItem] = [block.get_next_block(page, None)]
                for list_item_id in block.structure.copy():
                    list_item_block: ListItem = page.get_block(list_item_id)

                    while stack and list_item_block.list_indent_level <= stack[-1].list_indent_level:
                        stack.pop()

                    if stack:
                        current_parent = stack[-1]
                        current_parent.add_structure(list_item_block)
                        current_parent.polygon = current_parent.polygon.merge([list_item_block.polygon])

                        # The item now lives under its parent, not the group.
                        block.remove_structure_items([list_item_id])
                    stack.append(list_item_block)
101 |
```
--------------------------------------------------------------------------------
/marker/config/crawler.py:
--------------------------------------------------------------------------------
```python
1 | import importlib
2 | import inspect
3 | import pkgutil
4 | from functools import cached_property
5 | from typing import Annotated, Dict, Set, Type, get_args, get_origin
6 |
7 | from marker.builders import BaseBuilder
8 | from marker.converters import BaseConverter
9 | from marker.extractors import BaseExtractor
10 | from marker.processors import BaseProcessor
11 | from marker.providers import BaseProvider
12 | from marker.renderers import BaseRenderer
13 | from marker.services import BaseService
14 |
15 |
class ConfigCrawler:
    """Discovers marker's configurable classes and their annotated settings.

    Walks the package of each base class, imports every submodule, collects
    all subclasses, and records each annotated class attribute (raw type,
    formatted type, default value, and description metadata) into
    ``class_config_map``, keyed by category ("Builder", "Processor", ...)
    and then by class name.
    """

    def __init__(
        self,
        base_classes=(
            BaseBuilder,
            BaseProcessor,
            BaseConverter,
            BaseProvider,
            BaseRenderer,
            BaseService,
            BaseExtractor,
        ),
    ):
        self.base_classes = base_classes
        self.class_config_map: Dict[str, dict] = {}

        self._crawl_config()

    def _crawl_config(self):
        """Populate ``class_config_map`` for every subclass of every base class."""
        for base in self.base_classes:
            category = base.__name__.removeprefix("Base")
            category_map = self.class_config_map.setdefault(category, {})

            for class_name, class_type in self._find_subclasses(base).items():
                # Intermediate "Base*" classes are abstract plumbing; skip them.
                if class_name.startswith("Base"):
                    continue

                entry = category_map.setdefault(
                    class_name, {"class_type": class_type, "config": {}}
                )
                for attr, attr_type in self._gather_super_annotations(
                    class_type
                ).items():
                    default = getattr(class_type, attr)
                    metadata = (f"Default is {default}.",)

                    if get_origin(attr_type) is Annotated:
                        # Prefer the Annotated descriptions; only append the
                        # auto-generated default note when none of them
                        # already mention a default.
                        if any("Default" in desc for desc in attr_type.__metadata__):
                            metadata = attr_type.__metadata__
                        else:
                            metadata = attr_type.__metadata__ + metadata
                        attr_type = get_args(attr_type)[0]

                    entry["config"][attr] = (
                        attr_type,
                        self._format_type(attr_type),
                        default,
                        metadata,
                    )

    @staticmethod
    def _gather_super_annotations(cls: Type) -> Dict[str, Type]:
        """
        Collect all annotated attributes from `cls` and its superclasses, bottom-up.
        Subclass attributes overwrite superclass attributes with the same name.
        """
        annotations: Dict[str, Type] = {}
        # Walk the MRO base -> derived so subclass annotations overwrite
        # same-named superclass ones.
        for klass in reversed(cls.__mro__):
            if klass is object:
                continue
            if hasattr(klass, "__annotations__"):
                annotations.update(klass.__annotations__)
        return annotations

    @cached_property
    def attr_counts(self) -> Dict[str, int]:
        """How many discovered classes declare each attribute name."""
        counts: Dict[str, int] = {}
        for category_map in self.class_config_map.values():
            for entry in category_map.values():
                for attr in entry["config"]:
                    counts[attr] = counts.get(attr, 0) + 1
        return counts

    @cached_property
    def attr_set(self) -> Set[str]:
        """Every accepted config key: bare attribute names plus
        ``ClassName_attr``-style per-class overrides."""
        names: Set[str] = set()
        for category_map in self.class_config_map.values():
            for class_name, entry in category_map.items():
                for attr in entry["config"]:
                    names.add(attr)
                    names.add(f"{class_name}_{attr}")
        return names

    def _find_subclasses(self, base_class):
        """Import every module under ``base_class``'s package and collect all
        strict subclasses of it, keyed by class name."""
        subclasses = {}
        package_name = base_class.__module__
        package = importlib.import_module(package_name)
        if hasattr(package, "__path__"):
            for _, submodule_name, _ in pkgutil.walk_packages(
                package.__path__, package_name + "."
            ):
                try:
                    module = importlib.import_module(submodule_name)
                    for name, obj in inspect.getmembers(module, inspect.isclass):
                        if issubclass(obj, base_class) and obj is not base_class:
                            subclasses[name] = obj
                except ImportError:
                    # Modules with missing optional dependencies are skipped.
                    pass
        return subclasses

    def _format_type(self, t: Type) -> str:
        """Format a typing type like Optional[int] into a readable string."""
        if get_origin(t):  # Handle Optional and types with origins separately
            return f"{t}".removeprefix("typing.")
        # Regular types like int, str
        return t.__name__
123 |
124 |
# Module-level singleton; constructing it imports and inspects every module
# under the marker base packages, so this runs at import time.
crawler = ConfigCrawler()
126 |
```
--------------------------------------------------------------------------------
/marker/processors/line_merge.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated, List
2 |
3 | from marker.processors import BaseProcessor
4 | from marker.schema import BlockTypes
5 | from marker.schema.blocks import Block
6 | from marker.schema.document import Document
7 | from marker.schema.text import Line
8 | from marker.util import matrix_intersection_area
9 |
10 |
11 | class LineMergeProcessor(BaseProcessor):
12 | """
13 | A processor for merging inline math lines.
14 | """
15 | block_types = (BlockTypes.Text, BlockTypes.TextInlineMath, BlockTypes.Caption, BlockTypes.Footnote, BlockTypes.SectionHeader)
16 | min_merge_pct: Annotated[
17 | float,
18 | "The minimum percentage of intersection area to consider merging."
19 | ] = .015
20 | block_expand_threshold: Annotated[
21 | float,
22 | "The percentage of the block width to expand the bounding box."
23 | ] = .05
24 | min_merge_ydist: Annotated[
25 | float,
26 | "The minimum y distance between lines to consider merging."
27 | ] = 5
28 | intersection_pct_threshold: Annotated[
29 | float,
30 | "The total amount of intersection area concentrated in the max intersection block."
31 | ] = .5
32 | vertical_overlap_pct_threshold: Annotated[
33 | float,
34 | "The minimum percentage of vertical overlap to consider merging."
35 | ] = .8
36 | use_llm: Annotated[
37 | bool,
38 | "Whether to use LLMs to improve accuracy."
39 | ] = False
40 |
    def __init__(self, config):
        # No extra state: delegates straight to BaseProcessor.
        super().__init__(config)
43 |
44 | def merge_lines(self, lines: List[Line], block: Block):
45 | lines = [l for l in lines if l.polygon.width * 5 > l.polygon.height] # Skip vertical lines
46 | line_bboxes = [l.polygon.expand(self.block_expand_threshold, 0).bbox for l in lines] # Expand horizontally
47 | intersections = matrix_intersection_area(line_bboxes, line_bboxes)
48 |
49 | merges = []
50 | merge = []
51 | for i in range(len(line_bboxes)):
52 | intersection_row = intersections[i]
53 | intersection_row[i] = 0 # Zero out the current idx
54 |
55 | if i < len(line_bboxes) - 1:
56 | intersection_row[i+1] = 0 # Zero out the next idx, so we only evaluate merge from the left
57 |
58 | if len(merge) == 0:
59 | merge.append(i)
60 | continue
61 |
62 | # Zero out previous merge segments
63 | merge_intersection = sum([intersection_row[m] for m in merge])
64 | line_area = lines[i].polygon.area
65 | intersection_pct = merge_intersection / max(1, line_area)
66 |
67 | total_intersection = max(1, sum(intersection_row))
68 |
69 | line_start = lines[merge[0]].polygon.y_start
70 | line_end = lines[merge[0]].polygon.y_end
71 |
72 | vertical_overlap_start = max(line_start, lines[i].polygon.y_start)
73 | vertical_overlap_end = min(line_end, lines[i].polygon.y_end)
74 | vertical_overlap = max(0, vertical_overlap_end - vertical_overlap_start)
75 | vertical_overlap_pct = vertical_overlap / max(1, lines[i].polygon.height)
76 |
77 | if all([
78 | # Overlaps enough
79 | intersection_pct >= self.min_merge_pct,
80 | # Within same line
81 | vertical_overlap_pct > self.vertical_overlap_pct_threshold,
82 | # doesn't overlap with anything else
83 | merge_intersection / total_intersection > self.intersection_pct_threshold
84 | ]):
85 | merge.append(i)
86 | else:
87 | merges.append(merge)
88 | merge = []
89 |
90 | if merge:
91 | merges.append(merge)
92 |
93 | merges = [m for m in merges if len(m) > 1]
94 | merged = set()
95 | for merge in merges:
96 | merge = [m for m in merge if m not in merged]
97 | if len(merge) < 2:
98 | continue
99 |
100 | line: Line = lines[merge[0]]
101 | merged.add(merge[0])
102 | for idx in merge[1:]:
103 | other_line: Line = lines[idx]
104 | line.merge(other_line)
105 | block.structure.remove(other_line.id)
106 | other_line.removed = True # Mark line as removed
107 | merged.add(idx)
108 |
109 | # It is probably math if we are merging provider lines like this
110 | if not line.formats:
111 | line.formats = ["math"]
112 | elif "math" not in line.formats:
113 | line.formats.append("math")
114 |
115 |
116 | def __call__(self, document: Document):
117 | # Merging lines only needed for inline math
118 | if not self.use_llm:
119 | return
120 |
121 | for page in document.pages:
122 | for block in page.contained_blocks(document, self.block_types):
123 | if block.structure is None:
124 | continue
125 |
126 | if not len(block.structure) >= 2: # Skip single lines
127 | continue
128 |
129 | lines = block.contained_blocks(document, (BlockTypes.Line,))
130 | self.merge_lines(lines, block)
131 |
```
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
```python
1 | import tempfile
2 | from typing import Dict, Type
3 |
4 | from PIL import Image, ImageDraw
5 |
6 | import datasets
7 | import pytest
8 |
9 | from marker.builders.document import DocumentBuilder
10 | from marker.builders.layout import LayoutBuilder
11 | from marker.builders.line import LineBuilder
12 | from marker.builders.ocr import OcrBuilder
13 | from marker.builders.structure import StructureBuilder
14 | from marker.converters.pdf import PdfConverter
15 | from marker.models import create_model_dict
16 | from marker.providers.registry import provider_from_filepath
17 | from marker.renderers.chunk import ChunkRenderer
18 | from marker.renderers.html import HTMLRenderer
19 | from marker.schema import BlockTypes
20 | from marker.schema.blocks import Block
21 | from marker.renderers.markdown import MarkdownRenderer
22 | from marker.renderers.json import JSONRenderer
23 | from marker.schema.registry import register_block_class
24 | from marker.util import classes_to_strings, strings_to_classes
25 |
26 |
@pytest.fixture(scope="session")
def model_dict():
    """Session-wide dict of loaded marker models, released at session end."""
    models = create_model_dict()
    yield models
    del models
32 |
33 |
@pytest.fixture(scope="session")
def layout_model(model_dict):
    """Layout model pulled from the shared session model dict."""
    model = model_dict["layout_model"]
    yield model
37 |
38 |
@pytest.fixture(scope="session")
def detection_model(model_dict):
    """Detection model pulled from the shared session model dict."""
    model = model_dict["detection_model"]
    yield model
42 |
43 |
@pytest.fixture(scope="session")
def recognition_model(model_dict):
    """Recognition model pulled from the shared session model dict."""
    model = model_dict["recognition_model"]
    yield model
47 |
48 |
@pytest.fixture(scope="session")
def table_rec_model(model_dict):
    """Table-recognition model pulled from the shared session model dict."""
    model = model_dict["table_rec_model"]
    yield model
52 |
53 |
@pytest.fixture(scope="session")
def ocr_error_model(model_dict):
    """OCR-error model pulled from the shared session model dict."""
    model = model_dict["ocr_error_model"]
    yield model
57 |
58 |
@pytest.fixture(scope="function")
def config(request):
    """Per-test config dict taken from the @pytest.mark.config marker.

    Any "override_map" entry is registered into the block-class registry as a
    side effect before the config is returned.
    """
    marker = request.node.get_closest_marker("config")
    cfg = {} if marker is None else marker.args[0]

    overrides: Dict[BlockTypes, Type[Block]] = cfg.get("override_map", {})
    for blk_type, blk_cls in overrides.items():
        register_block_class(blk_type, blk_cls)

    return cfg
69 |
70 |
@pytest.fixture(scope="session")
def pdf_dataset():
    """Load the shared PDF test dataset once per session."""
    dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
    return dataset
74 |
75 |
@pytest.fixture(scope="function")
def temp_doc(request, pdf_dataset):
    """Write the requested dataset document to a named temporary file.

    The document is chosen with @pytest.mark.filename("name.ext") and
    defaults to "adversarial.pdf". Fix: the NamedTemporaryFile is now closed
    on teardown (which also deletes it), instead of lingering open until
    garbage collection.
    """
    filename_mark = request.node.get_closest_marker("filename")
    filename = filename_mark.args[0] if filename_mark else "adversarial.pdf"

    idx = pdf_dataset["filename"].index(filename)
    suffix = filename.split(".")[-1]

    temp_pdf = tempfile.NamedTemporaryFile(suffix=f".{suffix}")
    try:
        temp_pdf.write(pdf_dataset["pdf"][idx])
        temp_pdf.flush()
        yield temp_pdf
    finally:
        # Closing deletes the file (NamedTemporaryFile default delete=True).
        temp_pdf.close()
88 |
89 |
@pytest.fixture(scope="function")
def doc_provider(request, config, temp_doc):
    """Provider instance matching the temp document's file type."""
    cls = provider_from_filepath(temp_doc.name)
    provider = cls(temp_doc.name, config)
    yield provider
94 |
95 |
@pytest.fixture(scope="function")
def pdf_document(
    request,
    config,
    doc_provider,
    layout_model,
    ocr_error_model,
    recognition_model,
    detection_model,
):
    """Fully built and structured Document for the current doc_provider."""
    document_builder = DocumentBuilder(config)
    layout = LayoutBuilder(layout_model, config)
    lines = LineBuilder(detection_model, ocr_error_model, config)
    ocr = OcrBuilder(recognition_model, config)
    document = document_builder(doc_provider, layout, lines, ocr)
    StructureBuilder(config)(document)
    yield document
114 |
115 |
@pytest.fixture(scope="function")
def pdf_converter(request, config, model_dict, renderer, llm_service):
    """PdfConverter wired up with the test renderer and optional LLM service."""
    service = classes_to_strings([llm_service])[0] if llm_service else llm_service
    yield PdfConverter(
        artifact_dict=model_dict,
        processor_list=None,
        renderer=classes_to_strings([renderer])[0],
        config=config,
        llm_service=service,
    )
127 |
128 |
@pytest.fixture(scope="function")
def renderer(request, config):
    """Renderer class selected by the @pytest.mark.output_format marker.

    Defaults to MarkdownRenderer when no marker is present; raises ValueError
    for an unrecognized format name.
    """
    marker = request.node.get_closest_marker("output_format")
    if marker is None:
        return MarkdownRenderer

    output_format = marker.args[0]
    renderers = {
        "markdown": MarkdownRenderer,
        "json": JSONRenderer,
        "html": HTMLRenderer,
        "chunks": ChunkRenderer,
    }
    if output_format not in renderers:
        raise ValueError(f"Unknown output format: {output_format}")
    return renderers[output_format]
145 |
146 |
@pytest.fixture(scope="function")
def llm_service(request, config):
    """Resolve the configured LLM service class, or None when unset."""
    service_name = config.get("llm_service")
    if service_name:
        yield strings_to_classes([service_name])[0]
    else:
        yield None
154 |
155 |
@pytest.fixture(scope="function")
def temp_image():
    """A 512x512 white PNG with sample text, written to a temp file."""
    image = Image.new("RGB", (512, 512), color="white")
    ImageDraw.Draw(image).text((200, 200), "Hello, World!", fill="black", font_size=36)
    with tempfile.NamedTemporaryFile(suffix=".png") as f:
        image.save(f.name)
        f.flush()
        yield f
165 |
```
--------------------------------------------------------------------------------
/marker/extractors/document.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 |
3 | from pydantic import BaseModel
4 | from typing import Annotated, Optional, List
5 |
6 | from marker.extractors import BaseExtractor
7 | from marker.extractors.page import PageExtractionSchema
8 | from marker.logger import get_logger
9 |
10 | logger = get_logger()
11 |
12 |
class DocumentExtractionSchema(BaseModel):
    """Structured LLM response: free-text analysis plus the extracted JSON string."""
    analysis: str
    document_json: str
16 |
17 |
class DocumentExtractor(BaseExtractor):
    """
    An extractor that combines data from across all pages.

    Concatenates the per-page notes produced by the page extractor, prompts
    the LLM with those notes plus the target JSON schema, and returns the
    analysis and extracted JSON as a DocumentExtractionSchema.
    """

    page_schema: Annotated[
        str,
        "The JSON schema to be extracted from the page.",
    ] = ""

    page_extraction_prompt = """You are an expert document analyst who reads documents and pulls data out in JSON format. You will receive your detailed notes from all the pages of a document, and a JSON schema that we want to extract from the document. Your task is to extract all the information properly into the JSON schema.

Some notes:
- The schema may contain a single object to extract from the entire document, or an array of objects.
- The schema may contain nested objects, arrays, and other complex structures.

Some guidelines:
- Some entities will span multiple pages, so make sure to consult your notes thoroughly.
- In the case of potential conflicting values, pull out the values you have the most confidence in, from your notes.
- If you cannot find a value for a field, leave it blank in the JSON.

**Instructions:**
1. Analyze your provided notes.
2. Analyze the JSON schema.
3. Write a detailed analysis of the notes, and the associated values in the schema. Make sure to reference which page each piece of information comes from.
4. Write the output in the JSON schema format, ensuring all required fields are filled out. Output only the json data, without any additional text or formatting.

**Example:**
Input:

Detailed Notes
Page 0
On this page, I see a table with car makes and sales. The makes are Honda and Toyota, with sales of 100 and 200 respectively. The color is not present in the table, so I will leave it blank in the JSON. That information may be present on another page. Some JSON snippets I may find useful later are:
```json
{
    "make": "Honda",
    "sales": 100,
}
```
```json
{
    "make": "Toyota",
    "sales": 200,
}
```

Honda is the first row in the table, and Toyota is the second row. Make is the first column, and sales is the second.

Page 1
I see a table that contains 2 rows, and has a color header. The first row has the color red, and the second row has the color blue. Here are some useful snippets:

Schema

```json
{'$defs': {'Cars': {'properties': {'make': {'title': 'Make', 'type': 'string'}, 'sales': {'title': 'Sales', 'type': 'integer'}, 'color': {'title': 'Color', 'type': 'string'}}, 'required': ['make', 'sales', 'color'], 'title': 'Cars', 'type': 'object'}}, 'properties': {'cars': {'items': {'$ref': '#/$defs/Cars'}, 'title': 'Cars', 'type': 'array'}}, 'required': ['cars'], 'title': 'CarsList', 'type': 'object'}
```

Output:

Analysis: From the notes, it looks like the information I need is in a table that spans 2 pages.  The first page has the makes and sales, while the second page has the colors.  I will combine this information into the JSON schema.
JSON

{
    "cars": [
        {
            "make": "Honda",
            "sales": 100,
            "color": "red"
        },
        {
            "make": "Toyota",
            "sales": 200,
            "color": "blue"
        }
    ]
}

**Input:**

Detailed Notes
{{document_notes}}

Schema
```json
{{schema}}
```
"""

    def assemble_document_notes(self, page_notes: List[PageExtractionSchema]) -> str:
        """Concatenate per-page detailed notes into one prompt section.

        Pages with empty notes are skipped. (Bug fix: this previously tested
        the whole ``page_notes`` list — always truthy inside the loop — so
        empty pages were never skipped.)
        """
        notes = ""
        for i, page_schema in enumerate(page_notes):
            if not page_schema.detailed_notes:
                continue
            notes += f"Page {i + 1}\n{page_schema.detailed_notes}\n\n"
        return notes.strip()

    def __call__(
        self,
        page_notes: List[PageExtractionSchema],
        **kwargs,
    ) -> Optional[DocumentExtractionSchema]:
        """Run document-level extraction over the assembled page notes.

        Returns None when the LLM response is missing either required key.
        Raises ValueError when no page schema is configured.
        """
        if not self.page_schema:
            raise ValueError(
                "Page schema must be defined for structured extraction to work."
            )

        prompt = self.page_extraction_prompt.replace(
            "{{document_notes}}", self.assemble_document_notes(page_notes)
        ).replace("{{schema}}", json.dumps(self.page_schema))
        response = self.llm_service(prompt, None, None, DocumentExtractionSchema)

        logger.debug(f"Document extraction response: {response}")

        if not response or any(
            [
                key not in response
                for key in [
                    "analysis",
                    "document_json",
                ]
            ]
        ):
            return None

        # Strip a markdown code fence if present. Bug fix: lstrip/rstrip treat
        # their argument as a *character set*, which could eat leading payload
        # characters (e.g. a JSON document starting with "null").
        raw_json = response["document_json"].strip()
        if raw_json.startswith("```json"):
            raw_json = raw_json[len("```json"):]
        elif raw_json.startswith("```"):
            raw_json = raw_json[3:]
        if raw_json.endswith("```"):
            raw_json = raw_json[:-3]
        json_data = raw_json.strip()

        return DocumentExtractionSchema(
            analysis=response["analysis"], document_json=json_data
        )
147 |
```
--------------------------------------------------------------------------------
/marker/scripts/server.py:
--------------------------------------------------------------------------------
```python
1 | import traceback
2 |
3 | import click
4 | import os
5 |
6 | from pydantic import BaseModel, Field
7 | from starlette.responses import HTMLResponse
8 |
9 | from marker.config.parser import ConfigParser
10 | from marker.output import text_from_rendered
11 |
12 | import base64
13 | from contextlib import asynccontextmanager
14 | from typing import Optional, Annotated
15 | import io
16 |
17 | from fastapi import FastAPI, Form, File, UploadFile
18 | from marker.converters.pdf import PdfConverter
19 | from marker.models import create_model_dict
20 | from marker.settings import settings
21 |
app_data = {}  # populated with loaded models by the lifespan handler


UPLOAD_DIRECTORY = "./uploads"  # scratch directory for /marker/upload bodies
os.makedirs(UPLOAD_DIRECTORY, exist_ok=True)
27 |
28 |
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load marker models at startup; drop the reference at shutdown."""
    app_data["models"] = create_model_dict()

    yield

    app_data.pop("models", None)
37 |
38 |
39 | app = FastAPI(lifespan=lifespan)
40 |
41 |
@app.get("/")
async def root():
    # Plain HTML landing page linking to the docs and the conversion endpoint.
    body = """
<h1>Marker API</h1>
<ul>
    <li><a href="/docs">API Documentation</a></li>
    <li><a href="/marker">Run marker (post request only)</a></li>
</ul>
"""
    return HTMLResponse(body)
53 |
54 |
class CommonParams(BaseModel):
    """Request body shared by the /marker endpoints.

    Mirrors the marker CLI conversion options; see the Field descriptions
    on each attribute for details.
    """

    filepath: Annotated[
        Optional[str], Field(description="The path to the PDF file to convert.")
    ]
    page_range: Annotated[
        Optional[str],
        Field(
            description="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20",
            example=None,
        ),
    ] = None
    force_ocr: Annotated[
        bool,
        Field(
            description="Force OCR on all pages of the PDF. Defaults to False. This can lead to worse results if you have good text in your PDFs (which is true in most cases)."
        ),
    ] = False
    paginate_output: Annotated[
        bool,
        Field(
            description="Whether to paginate the output. Defaults to False. If set to True, each page of the output will be separated by a horizontal rule that contains the page number (2 newlines, {PAGE_NUMBER}, 48 - characters, 2 newlines)."
        ),
    ] = False
    output_format: Annotated[
        str,
        Field(
            description="The format to output the text in. Can be 'markdown', 'json', or 'html'. Defaults to 'markdown'."
        ),
    ] = "markdown"
84 |
85 |
async def _convert_pdf(params: CommonParams):
    """Run the marker pipeline for ``params`` and package a JSON-safe result.

    On any conversion error, returns {"success": False, "error": ...} instead
    of raising. Images are returned base64-encoded.
    """
    # NOTE(review): assert is stripped under `python -O`; an explicit check
    # would be more robust for input validation.
    assert params.output_format in ["markdown", "json", "html", "chunks"], (
        "Invalid output format"
    )
    try:
        config_parser = ConfigParser(params.model_dump())
        config_dict = config_parser.generate_config_dict()
        config_dict["pdftext_workers"] = 1
        converter = PdfConverter(
            config=config_dict,
            artifact_dict=app_data["models"],
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service(),
        )
        rendered = converter(params.filepath)
        text, _, images = text_from_rendered(rendered)
        metadata = rendered.metadata
    except Exception as e:
        traceback.print_exc()
        return {"success": False, "error": str(e)}

    # Base64-encode every extracted image for transport.
    encoded = {}
    for name, image in images.items():
        buffer = io.BytesIO()
        image.save(buffer, format=settings.OUTPUT_IMAGE_FORMAT)
        encoded[name] = base64.b64encode(buffer.getvalue()).decode(
            settings.OUTPUT_ENCODING
        )

    return {
        "format": params.output_format,
        "output": text,
        "images": encoded,
        "metadata": metadata,
        "success": True,
    }
128 |
129 |
@app.post("/marker")
async def convert_pdf(params: CommonParams):
    # Thin endpoint wrapper: all of the work happens in _convert_pdf.
    return await _convert_pdf(params)
133 |
134 |
@app.post("/marker/upload")
async def convert_pdf_upload(
    page_range: Optional[str] = Form(default=None),
    force_ocr: Optional[bool] = Form(default=False),
    paginate_output: Optional[bool] = Form(default=False),
    output_format: Optional[str] = Form(default="markdown"),
    file: UploadFile = File(
        ..., description="The PDF file to convert.", media_type="application/pdf"
    ),
):
    """Accept an uploaded document, convert it, and return the rendered output.

    The upload is written under UPLOAD_DIRECTORY, converted via _convert_pdf,
    and removed afterwards even if conversion raises.
    """
    # Security fix: file.filename is client-controlled; use only its basename
    # so a crafted name like "../../x.pdf" cannot escape the upload directory.
    safe_name = os.path.basename(file.filename or "upload.pdf")
    upload_path = os.path.join(UPLOAD_DIRECTORY, safe_name)
    with open(upload_path, "wb+") as upload_file:
        upload_file.write(await file.read())

    try:
        params = CommonParams(
            filepath=upload_path,
            page_range=page_range,
            force_ocr=force_ocr,
            paginate_output=paginate_output,
            output_format=output_format,
        )
        results = await _convert_pdf(params)
    finally:
        # Always clean up the temporary upload.
        os.remove(upload_path)
    return results
160 |
161 |
@click.command()
@click.option("--port", type=int, default=8000, help="Port to run the server on")
@click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
def server_cli(port: int, host: str):
    # Serve the marker FastAPI app with uvicorn on the requested host/port.
    import uvicorn

    uvicorn.run(app, host=host, port=port)
174 |
```
--------------------------------------------------------------------------------
/marker/processors/equation.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated, List, Tuple
2 | from PIL import Image
3 | import re
4 | from bs4 import BeautifulSoup
5 |
6 | from ftfy import fix_text, TextFixerConfig
7 | from surya.recognition import RecognitionPredictor, OCRResult
8 |
9 | from marker.processors import BaseProcessor
10 | from marker.schema import BlockTypes
11 | from marker.schema.document import Document
12 | from marker.settings import settings
13 |
14 | MATH_TAG_PATTERN = re.compile(r"<math[^>]*>(.*?)</math>")
15 |
16 |
class EquationProcessor(BaseProcessor):
    """
    A processor for recognizing equations in the document.

    Gathers every Equation block across all pages, runs the recognition model
    once over the batched page images/boxes, then normalizes each predicted
    math HTML string via fix_latex and writes it onto the block.
    """

    block_types: Annotated[
        Tuple[BlockTypes],
        "The block types to process.",
    ] = (BlockTypes.Equation,)
    # NOTE(review): model_max_length is not referenced in this file —
    # get_latex_batched hard-codes max_tokens=2048; verify intent.
    model_max_length: Annotated[
        int,
        "The maximum number of tokens to allow for the Recognition model.",
    ] = 1024
    equation_batch_size: Annotated[
        int,
        "The batch size to use for the recognition model while processing equations.",
        "Default is None, which will use the default batch size for the model.",
    ] = None
    disable_tqdm: Annotated[
        bool,
        "Whether to disable the tqdm progress bar.",
    ] = False
    drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False

    def __init__(self, recognition_model: RecognitionPredictor, config=None):
        super().__init__(config)

        self.recognition_model = recognition_model

    def get_batch_size(self) -> int:
        """Return the recognition batch size; explicit config wins over device defaults."""
        # Set to 1/4th of OCR batch size due to sequence length with tiling
        if self.equation_batch_size is not None:
            return self.equation_batch_size
        elif settings.TORCH_DEVICE_MODEL == "cuda":
            return 32
        elif settings.TORCH_DEVICE_MODEL == "mps":
            return 6
        return 6  # CPU fallback (same value as mps)

    def __call__(self, document: Document):
        """Recognize all Equation blocks in ``document`` and set their html."""
        images = []  # one highres image per page
        equation_boxes = []  # per-page lists of bboxes in image coordinates
        equation_block_ids = []  # per-page lists of block ids, parallel to boxes
        total_equation_blocks = 0

        for page in document.pages:
            page_image = page.get_image(highres=True)
            page_size = page.polygon.width, page.polygon.height
            image_size = page_image.size

            page_equation_boxes = []
            page_equation_block_ids = []
            equation_blocks = page.contained_blocks(document, self.block_types)
            for block in equation_blocks:
                # Rescale from page coordinates to highres image coordinates.
                page_equation_boxes.append(
                    block.polygon.rescale(page_size, image_size).bbox
                )
                page_equation_block_ids.append(block.id)
                total_equation_blocks += 1

            images.append(page_image)
            equation_boxes.append(page_equation_boxes)
            equation_block_ids.append(page_equation_block_ids)

        if total_equation_blocks == 0:
            return

        predictions = self.get_latex_batched(images, equation_boxes)
        for page_predictions, page_equation_block_ids in zip(
            predictions, equation_block_ids
        ):
            assert len(page_predictions) == len(page_equation_block_ids), (
                "Every equation block should have a corresponding prediction"
            )
            for block_prediction, block_id in zip(
                page_predictions, page_equation_block_ids
            ):
                block = document.get_block(block_id)
                block.html = self.fix_latex(block_prediction)

    def fix_latex(self, math_html: str) -> str:
        """Normalize predicted math HTML.

        Forces display="block" on the first <math> tag, strips stray literal
        "\\n" sequences and <br> tags, and fixes text encoding. Returns "" when
        no <math> tag is present.
        """
        math_html = math_html.strip()
        soup = BeautifulSoup(math_html, "html.parser")
        opening_math_tag = soup.find("math")

        # No math block found
        if not opening_math_tag:
            return ""

        # Force block format
        opening_math_tag.attrs["display"] = "block"
        fixed_math_html = str(soup)

        # Sometimes model outputs newlines at the beginning/end of tags
        # (raw string: "\\n" matches a literal backslash-n sequence, not a newline).
        fixed_math_html = re.sub(
            r"^<math display=\"block\">\\n(?![a-zA-Z])",
            '<math display="block">',
            fixed_math_html,
        )
        fixed_math_html = re.sub(r"\\n</math>$", "</math>", fixed_math_html)
        fixed_math_html = re.sub(r"<br>", "", fixed_math_html)
        fixed_math_html = fix_text(
            fixed_math_html, config=TextFixerConfig(unescape_html=True)
        )
        return fixed_math_html

    def get_latex_batched(
        self,
        page_images: List[Image.Image],
        bboxes: List[List[List[float]]],
    ):
        """Run recognition over all pages' equation boxes.

        Returns a per-page list of predicted text strings, parallel to bboxes.
        """
        self.recognition_model.disable_tqdm = self.disable_tqdm
        predictions: List[OCRResult] = self.recognition_model(
            images=page_images,
            bboxes=bboxes,
            task_names=["ocr_with_boxes"] * len(page_images),
            recognition_batch_size=self.get_batch_size(),
            sort_lines=False,
            drop_repeated_text=self.drop_repeated_text,
            max_tokens=2048,
            max_sliding_window=2148,
        )

        equation_predictions = [
            [line.text.strip() for line in page_prediction.text_lines]
            for page_prediction in predictions
        ]

        return equation_predictions
146 |
```
--------------------------------------------------------------------------------
/marker/processors/llm/llm_equation.py:
--------------------------------------------------------------------------------
```python
1 | from pydantic import BaseModel
2 |
3 | from marker.processors.llm import BaseLLMSimpleBlockProcessor, PromptData, BlockData
4 | from marker.schema import BlockTypes
5 | from marker.schema.document import Document
6 |
7 | from typing import Annotated, List
8 |
9 |
class LLMEquationProcessor(BaseLLMSimpleBlockProcessor):
    """LLM-backed correction of Equation blocks into KaTeX-compatible math HTML.

    Selects equation blocks that are tall enough relative to the page (or all
    equations when redo_inline_math is set), prompts the LLM with the block
    image plus the current html/text, and replaces block.html with the
    corrected output after sanity checks.
    """

    block_types = (BlockTypes.Equation,)
    min_equation_height: Annotated[
        float,
        "The minimum ratio between equation height and page height to consider for processing.",
    ] = 0.06
    image_expansion_ratio: Annotated[
        float,
        "The ratio to expand the image by when cropping.",
    ] = 0.05 # Equations sometimes get bboxes that are too tight
    redo_inline_math: Annotated[
        bool,
        "Whether to redo inline math blocks.",
    ] = False
    equation_latex_prompt: Annotated[
        str,
        "The prompt to use for generating LaTeX from equations.",
        "Default is a string containing the Gemini prompt."
    ] = r"""You're an expert mathematician who is good at writing LaTeX code and html for equations.
You'll receive an image of a math block, along with the text extracted from the block.  It may contain one or more equations. Your job is to write html that represents the content of the image, with the equations in LaTeX format.

Some guidelines:
- Output valid html, where all the equations can render properly.
- Use <math display="block"> as a block equation delimiter and <math> for inline equations.  Do not use $ or $$ as delimiters.
- Keep the LaTeX code inside the math tags simple, concise, and KaTeX compatible.
- Enclose all equations in the correct math tags. Use multiple math tags inside the html to represent multiple equations.
- Only use the html tags math, i, b, p, and br.
- Make sure to include all the equations in the image in the html output.
- Make sure to include other text in the image in the correct positions along with the equations.

**Instructions:**
1. Carefully examine the provided image.
2. Analyze the existing html, which may include LaTeX code.
3. Write a short analysis of how the html should be corrected to represent the image.
4. If the html and LaTeX are correct, write "No corrections needed."
5. If the html and LaTeX are incorrect, generate the corrected html.
6. Output only the analysis, then the corrected html or "No corrections needed."
**Example:**
Input:
```html
The following equation illustrates the Pythagorean theorem:
x2 + y2 = z2

And this equation is a bit more complex:
(ab * x5 + x2 + 2 * x + 123)/t
```
Output:
analysis: The equations are not formatted as LaTeX, or enclosed in math tags.
```html
<p>The following equation illustrates the Pythagorean theorem:</p>
<math display="block">x^{2} + y^{2} = z^{2}</math>

<p>And this equation is a bit more complex, and contains <math>ab \cdot x^{5}</math>:</p>
<math display="block">\frac{ab \cdot x^{5} + x^{2} + 2 \cdot x + 123}{t}</math>
```
**Input:**
```html
{equation}
```
"""

    def inference_blocks(self, document: Document) -> List[BlockData]:
        """Filter the parent selection down to equations worth reprocessing."""
        blocks = super().inference_blocks(document)
        out_blocks = []
        for block_data in blocks:
            block = block_data["block"]
            page = block_data["page"]

            # If we redo inline math, we redo all equations
            # (skip only when the block is short AND redo_inline_math is off).
            if all([
                block.polygon.height / page.polygon.height < self.min_equation_height,
                not self.redo_inline_math
            ]):
                continue
            out_blocks.append(block_data)
        return out_blocks

    def block_prompts(self, document: Document) -> List[PromptData]:
        """Build one prompt (text + cropped image) per selected equation block."""
        prompt_data = []
        for block_data in self.inference_blocks(document):
            block = block_data["block"]
            text = block.html if block.html else block.raw_text(document)
            prompt = self.equation_latex_prompt.replace("{equation}", text)
            image = self.extract_image(document, block)

            prompt_data.append({
                "prompt": prompt,
                "image": image,
                "block": block,
                "schema": EquationSchema,
                "page": block_data["page"]
            })

        return prompt_data


    def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
        """Apply the LLM's corrected html to the block, with sanity checks."""
        block = prompt_data["block"]
        text = block.html if block.html else block.raw_text(document)

        if not response or "corrected_equation" not in response:
            block.update_metadata(llm_error_count=1)
            return

        html_equation = response["corrected_equation"]

        if "no corrections needed" in html_equation.lower():
            return

        # Reject output with unbalanced math tags or implausibly short content
        # (< 30% of the original text length).
        balanced_tags = html_equation.count("<math") == html_equation.count("</math>")
        if not all([
            html_equation,
            balanced_tags,
            len(html_equation) > len(text) * .3,
        ]):
            block.update_metadata(llm_error_count=1)
            return

        block.html = html_equation
129 |
class EquationSchema(BaseModel):
    """Structured LLM response: free-text analysis plus the corrected equation html."""
    analysis: str
    corrected_equation: str
```
--------------------------------------------------------------------------------
/marker/builders/structure.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated
2 |
3 | from marker.builders import BaseBuilder
4 | from marker.schema import BlockTypes
5 | from marker.schema.blocks import Text
6 | from marker.schema.document import Document
7 | from marker.schema.groups import ListGroup
8 | from marker.schema.groups.page import PageGroup
9 | from marker.schema.registry import get_block_class
10 |
11 |
12 | class StructureBuilder(BaseBuilder):
13 | """
14 | A builder for grouping blocks together based on their structure.
15 | """
16 | gap_threshold: Annotated[
17 | float,
18 | "The minimum gap between blocks to consider them part of the same group.",
19 | ] = 0.05
20 | list_gap_threshold: Annotated[
21 | float,
22 | "The minimum gap between list items to consider them part of the same group.",
23 | ] = 0.1
24 |
25 | def __init__(self, config=None):
26 | super().__init__(config)
27 |
28 | def __call__(self, document: Document):
29 | for page in document.pages:
30 | self.group_caption_blocks(page)
31 | self.group_lists(page)
32 | self.unmark_lists(page)
33 |
34 | def group_caption_blocks(self, page: PageGroup):
35 | gap_threshold_px = self.gap_threshold * page.polygon.height
36 | static_page_structure = page.structure.copy()
37 | remove_ids = list()
38 |
39 | for i, block_id in enumerate(static_page_structure):
40 | block = page.get_block(block_id)
41 | if block.block_type not in [BlockTypes.Table, BlockTypes.Figure, BlockTypes.Picture]:
42 | continue
43 |
44 | if block.id in remove_ids:
45 | continue
46 |
47 | block_structure = [block_id]
48 | selected_polygons = [block.polygon]
49 | caption_types = [BlockTypes.Caption, BlockTypes.Footnote]
50 |
51 | prev_block = page.get_prev_block(block)
52 | next_block = page.get_next_block(block)
53 |
54 | if prev_block and \
55 | prev_block.block_type in caption_types and \
56 | prev_block.polygon.minimum_gap(block.polygon) < gap_threshold_px and \
57 | prev_block.id not in remove_ids:
58 | block_structure.insert(0, prev_block.id)
59 | selected_polygons.append(prev_block.polygon)
60 |
61 | if next_block and \
62 | next_block.block_type in caption_types and \
63 | next_block.polygon.minimum_gap(block.polygon) < gap_threshold_px:
64 | block_structure.append(next_block.id)
65 | selected_polygons.append(next_block.polygon)
66 |
67 | if len(block_structure) > 1:
68 | # Create a merged block
69 | new_block_cls = get_block_class(BlockTypes[block.block_type.name + "Group"])
70 | new_polygon = block.polygon.merge(selected_polygons)
71 | group_block = page.add_block(new_block_cls, new_polygon)
72 | group_block.structure = block_structure
73 |
74 | # Update the structure of the page to reflect the new block
75 | page.update_structure_item(block_id, group_block.id)
76 | remove_ids.extend(block_structure)
77 | page.remove_structure_items(remove_ids)
78 |
    def group_lists(self, page: PageGroup):
        """Collect runs of vertically-adjacent ListItem blocks into a single ListGroup."""
        gap_threshold_px = self.list_gap_threshold * page.polygon.height
        # Snapshot of the structure; the live structure is edited during grouping.
        static_page_structure = page.structure.copy()
        remove_ids = list()
        for i, block_id in enumerate(static_page_structure):
            block = page.get_block(block_id)
            if block.block_type not in [BlockTypes.ListItem]:
                continue

            # Already absorbed into an earlier list group.
            if block.id in remove_ids:
                continue

            block_structure = [block_id]
            selected_polygons = [block.polygon]

            # Greedily extend the run with consecutive, nearby list items; stop at
            # the first non-list-item or too-distant block.
            for j, next_block_id in enumerate(page.structure[i + 1:]):
                next_block = page.get_block(next_block_id)
                if all([
                    next_block.block_type == BlockTypes.ListItem,
                    next_block.polygon.minimum_gap(selected_polygons[-1]) < gap_threshold_px
                ]):
                    block_structure.append(next_block_id)
                    selected_polygons.append(next_block.polygon)
                else:
                    break

            if len(block_structure) > 1:
                new_polygon = block.polygon.merge(selected_polygons)
                group_block = page.add_block(ListGroup, new_polygon)
                group_block.structure = block_structure

                # Update the structure of the page to reflect the new block
                page.update_structure_item(block_id, group_block.id)
                remove_ids.extend(block_structure)

        page.remove_structure_items(remove_ids)
115 |
116 | def unmark_lists(self, page: PageGroup):
117 | # If lists aren't grouped, unmark them as list items
118 | for block_id in page.structure:
119 | block = page.get_block(block_id)
120 | if block.block_type == BlockTypes.ListItem:
121 | generated_block = Text(
122 | polygon=block.polygon,
123 | page_id=block.page_id,
124 | structure=block.structure,
125 | )
126 | page.replace_block(block, generated_block)
127 |
```
--------------------------------------------------------------------------------
/marker/services/gemini.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import time
3 | import traceback
4 | from io import BytesIO
5 | from typing import List, Annotated
6 |
7 | import PIL
8 | from google import genai
9 | from google.genai import types
10 | from google.genai.errors import APIError
11 | from marker.logger import get_logger
12 | from pydantic import BaseModel
13 |
14 | from marker.schema.blocks import Block
15 | from marker.services import BaseService
16 |
17 | logger = get_logger()
18 |
19 |
class BaseGeminiService(BaseService):
    """Shared request logic for Gemini-backed LLM services.

    Subclasses supply a configured client via ``get_google_client``; this base
    class handles image encoding, structured-output configuration, and retries.
    """

    gemini_model_name: Annotated[
        str, "The name of the Google model to use for the service."
    ] = "gemini-2.0-flash"
    thinking_budget: Annotated[
        int, "The thinking token budget to use for the service."
    ] = None

    def img_to_bytes(self, img: PIL.Image.Image):
        # Encode as WEBP to keep the request payload small.
        image_bytes = BytesIO()
        img.save(image_bytes, format="WEBP")
        return image_bytes.getvalue()

    def get_google_client(self, timeout: int):
        # Subclasses must return a genai.Client configured with this timeout.
        raise NotImplementedError

    def process_images(self, images):
        # Wrap each PIL image as an inline WEBP part for the Gemini API.
        image_parts = [
            types.Part.from_bytes(data=self.img_to_bytes(img), mime_type="image/webp")
            for img in images
        ]
        return image_parts

    def __call__(
        self,
        prompt: str,
        image: PIL.Image.Image | List[PIL.Image.Image] | None,
        block: Block | None,
        response_schema: type[BaseModel],
        max_retries: int | None = None,
        timeout: int | None = None,
    ):
        """Send a prompt (plus optional images) to Gemini and return the parsed JSON dict.

        Retries rate-limit APIErrors with linear backoff and invalid-JSON
        responses with a slightly higher temperature; returns ``{}`` when all
        attempts fail.  Token usage is recorded on ``block`` when provided.
        """
        if max_retries is None:
            max_retries = self.max_retries

        if timeout is None:
            timeout = self.timeout

        client = self.get_google_client(timeout=timeout)
        image_parts = self.format_image_for_llm(image)

        total_tries = max_retries + 1
        temperature = 0
        for tries in range(1, total_tries + 1):
            config = {
                "temperature": temperature,
                "response_schema": response_schema,
                "response_mime_type": "application/json",
            }
            if self.max_output_tokens:
                config["max_output_tokens"] = self.max_output_tokens

            if self.thinking_budget is not None:
                # For gemini models, we can optionally set a thinking budget in the config
                config["thinking_config"] = types.ThinkingConfig(
                    thinking_budget=self.thinking_budget
                )

            try:
                responses = client.models.generate_content(
                    model=self.gemini_model_name,
                    contents=image_parts
                    + [
                        prompt
                    ],  # According to gemini docs, it performs better if the image is the first element
                    config=config,
                )
                output = responses.candidates[0].content.parts[0].text
                total_tokens = responses.usage_metadata.total_token_count
                if block:
                    block.update_metadata(
                        llm_tokens_used=total_tokens, llm_request_count=1
                    )
                return json.loads(output)
            except APIError as e:
                # NOTE(review): 443 is not a standard HTTP status code (it is the
                # HTTPS port) — confirm this retryable-code list is intended.
                if e.code in [429, 443, 503]:
                    # Rate limit exceeded
                    if tries == total_tries:
                        # Last attempt failed. Give up
                        logger.error(
                            f"APIError: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
                        )
                        break
                    else:
                        # Linear backoff: wait longer on each successive attempt.
                        wait_time = tries * self.retry_wait_time
                        logger.warning(
                            f"APIError: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})",
                        )
                        time.sleep(wait_time)
                else:
                    # Non-retryable API error.
                    logger.error(f"APIError: {e}")
                    break
            except json.JSONDecodeError as e:
                temperature = 0.2  # Increase temperature slightly to try and get a different response

                # The response was not valid JSON
                if tries == total_tries:
                    # Last attempt failed. Give up
                    logger.error(
                        f"JSONDecodeError: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
                    )
                    break
                else:
                    logger.warning(
                        f"JSONDecodeError: {e}. Retrying... (Attempt {tries}/{total_tries})",
                    )
            except Exception as e:
                # Unexpected failure: log with traceback and stop retrying.
                logger.error(f"Exception: {e}")
                traceback.print_exc()
                break

        return {}
132 |
133 |
class GoogleGeminiService(BaseGeminiService):
    """Gemini service that talks directly to the Google Generative AI API."""

    gemini_api_key: Annotated[str, "The Google API key to use for the service."] = None

    def get_google_client(self, timeout: int):
        return genai.Client(
            api_key=self.gemini_api_key,
            http_options={"timeout": timeout * 1000},  # Convert to milliseconds
        )
142 |
```
--------------------------------------------------------------------------------
/marker/processors/line_numbers.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated
2 |
3 | from marker.processors import BaseProcessor
4 | from marker.schema import BlockTypes
5 | from marker.schema.document import Document
6 |
7 |
class LineNumbersProcessor(BaseProcessor):
    """
    A processor for ignoring line numbers.

    Detects line-number artifacts — left-margin numeric spans, numeric line
    prefixes/suffixes, and whole numeric blocks — and marks them with
    ``ignore_for_output`` so they are dropped from rendered output.
    """
    block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
    strip_numbers_threshold: Annotated[
        float,
        "The fraction of lines or tokens in a block that must be numeric to consider them as line numbers.",
    ] = 0.6
    min_lines_in_block: Annotated[
        int,
        "The minimum number of lines required in a block for it to be considered during processing.",
        "Ensures that small blocks are ignored as they are unlikely to contain meaningful line numbers.",
    ] = 4
    min_line_length: Annotated[
        int,
        "The minimum length of a line (in characters) to consider it significant when checking for",
        "numeric prefixes or suffixes. Prevents false positives for short lines.",
    ] = 10
    min_line_number_span_ratio: Annotated[
        float,
        "The minimum ratio of detected line number spans to total lines required to treat them as line numbers.",
    ] = .6

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        # Order matters only loosely: spans first, then prefixes/suffixes, then whole blocks.
        self.ignore_line_number_spans(document)
        self.ignore_line_starts_ends(document)
        self.ignore_line_number_blocks(document)

    def ignore_line_number_spans(self, document: Document):
        """Hide the leftmost span of each line when enough lines on a page start with a bare number."""
        for page in document.pages:
            line_count = 0
            line_number_spans = []
            for block in page.contained_blocks(document, (BlockTypes.Line,)):
                if block.structure is None:
                    continue

                line_count += 1
                # Find the span furthest to the left on this line.
                leftmost_span = None
                for span in block.contained_blocks(document, (BlockTypes.Span,)):
                    if leftmost_span is None or span.polygon.x_start < leftmost_span.polygon.x_start:
                        leftmost_span = span

                if leftmost_span is not None and leftmost_span.text.strip().isnumeric():
                    line_number_spans.append(leftmost_span)

            # Only treat them as line numbers when most lines on the page have one.
            if line_count > 0 and len(line_number_spans) / line_count > self.min_line_number_span_ratio:
                for span in line_number_spans:
                    span.ignore_for_output = True

    def ignore_line_number_blocks(self, document: Document):
        """Hide whole blocks that are mostly digits and taller than wide (e.g. a column of line numbers)."""
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                raw_text = block.raw_text(document)
                tokens = raw_text.strip().split()
                # Too few tokens to judge reliably.
                if len(tokens) < 4:
                    continue

                tokens_are_numbers = [token.isdigit() for token in tokens]
                if all([
                    sum(tokens_are_numbers) / len(tokens) > self.strip_numbers_threshold,
                    block.polygon.height > block.polygon.width  # Ensure block is taller than it is wide, like vertical page numbers
                ]):
                    block.ignore_for_output = True

    def ignore_line_starts_ends(self, document: Document):
        """Hide the numeric first/last span of lines when most lines in a block start/end with a number."""
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                if block.structure is None:
                    continue

                all_lines = block.structure_blocks(document)
                if len(all_lines) < self.min_lines_in_block:
                    continue

                starts_with_number = []
                ends_with_number = []
                for line in all_lines:
                    spans = line.structure_blocks(document)
                    # A one-span line cannot have a separate line-number span.
                    if len(spans) < 2:
                        starts_with_number.append(False)
                        ends_with_number.append(False)
                        continue

                    raw_text = line.raw_text(document)
                    # A numeric prefix/suffix only counts when the rest of the line is long enough.
                    starts = all([
                        spans[0].text.strip().isdigit(),
                        len(raw_text) - len(spans[0].text.strip()) > self.min_line_length
                    ])

                    ends = all([
                        spans[-1].text.strip().isdigit(),
                        len(raw_text) - len(spans[-1].text.strip()) > self.min_line_length
                    ])

                    starts_with_number.append(starts)
                    ends_with_number.append(ends)

                if sum(starts_with_number) / len(starts_with_number) > self.strip_numbers_threshold:
                    for starts, line in zip(starts_with_number, all_lines):
                        if starts:
                            span = page.get_block(line.structure[0])
                            span.ignore_for_output = True

                if sum(ends_with_number) / len(ends_with_number) > self.strip_numbers_threshold:
                    for ends, line in zip(ends_with_number, all_lines):
                        if ends:
                            span = page.get_block(line.structure[-1])
                            span.ignore_for_output = True
120 |
```
--------------------------------------------------------------------------------
/marker/scripts/streamlit_app.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 |
3 | from marker.scripts.common import (
4 | load_models,
5 | parse_args,
6 | img_to_html,
7 | get_page_image,
8 | page_count,
9 | )
10 |
11 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
12 | os.environ["IN_STREAMLIT"] = "true"
13 |
14 | from marker.settings import settings
15 | from streamlit.runtime.uploaded_file_manager import UploadedFile
16 |
17 | import re
18 | import tempfile
19 | from typing import Any, Dict
20 |
21 | import streamlit as st
22 | from PIL import Image
23 |
24 | from marker.converters.pdf import PdfConverter
25 | from marker.config.parser import ConfigParser
26 | from marker.output import text_from_rendered
27 |
28 |
def convert_pdf(fname: str, config_parser: ConfigParser) -> (str, Dict[str, Any], dict):
    """Convert the file at ``fname`` with a PdfConverter built from ``config_parser``.

    NOTE(review): the annotation suggests a (text, metadata, images) tuple, but
    this returns ``converter(fname)`` directly — the rendered output object
    (``text_from_rendered`` is applied later by the caller).  Confirm and fix
    the annotation.
    """
    config_dict = config_parser.generate_config_dict()
    # The streamlit app runs conversion in-process; keep pdftext single-threaded.
    config_dict["pdftext_workers"] = 1
    converter_cls = PdfConverter
    converter = converter_cls(
        config=config_dict,
        artifact_dict=model_dict,  # module-level models loaded at app startup
        processor_list=config_parser.get_processors(),
        renderer=config_parser.get_renderer(),
        llm_service=config_parser.get_llm_service(),
    )
    return converter(fname)
41 |
42 |
def markdown_insert_images(markdown, images):
    """Replace markdown image tags whose path is in ``images`` with inline HTML <img> tags."""
    tag_pattern = re.compile(
        r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))'
    )

    # finditer runs over the original string (snapshot), so replacements below
    # do not affect which tags are found.
    for match in tag_pattern.finditer(markdown):
        full_tag = match.group(1)
        alt_text = match.group("image_title")
        path = match.group("image_path")
        if path in images:
            markdown = markdown.replace(full_tag, img_to_html(images[path], alt_text))
    return markdown
58 |
59 |
st.set_page_config(layout="wide")
col1, col2 = st.columns([0.5, 0.5])

# Load models and CLI defaults once at app startup.
model_dict = load_models()
cli_options = parse_args()

st.markdown("""
# Marker Demo

This app will let you try marker, a PDF or image -> Markdown, HTML, JSON converter. It works with any language, and extracts images, tables, equations, etc.

Find the project [here](https://github.com/VikParuchuri/marker).
""")

in_file: UploadedFile = st.sidebar.file_uploader(
    "PDF, document, or image file:",
    type=["pdf", "png", "jpg", "jpeg", "gif", "pptx", "docx", "xlsx", "html", "epub"],
)

if in_file is None:
    st.stop()

filetype = in_file.type

with col1:
    # Fix: use a distinct name so the imported page_count() helper is not shadowed.
    total_pages = page_count(in_file)
    page_number = st.number_input(
        f"Page number out of {total_pages}:", min_value=0, value=0, max_value=total_pages
    )
    pil_image = get_page_image(in_file, page_number)
    st.image(pil_image, use_container_width=True)

page_range = st.sidebar.text_input(
    "Page range to parse, comma separated like 0,5-10,20",
    value=f"{page_number}-{page_number}",
)
output_format = st.sidebar.selectbox(
    "Output format", ["markdown", "json", "html", "chunks"], index=0
)
run_marker = st.sidebar.button("Run Marker")

use_llm = st.sidebar.checkbox(
    "Use LLM", help="Use LLM for higher quality processing", value=False
)
force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
strip_existing_ocr = st.sidebar.checkbox(
    "Strip existing OCR",
    help="Strip existing OCR text from the PDF and re-OCR.",
    value=False,
)
debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
disable_ocr_math = st.sidebar.checkbox(
    "Disable math",
    help="Disable math in OCR output - no inline math",
    value=False,
)

if not run_marker:
    st.stop()

# Run Marker
with tempfile.TemporaryDirectory() as tmp_dir:
    # Persist the upload to disk; the converter expects a file path.
    temp_pdf = os.path.join(tmp_dir, "temp.pdf")
    with open(temp_pdf, "wb") as f:
        f.write(in_file.getvalue())

    cli_options.update(
        {
            "output_format": output_format,
            "page_range": page_range,
            "force_ocr": force_ocr,
            "debug": debug,
            "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
            "use_llm": use_llm,
            "strip_existing_ocr": strip_existing_ocr,
            "disable_ocr_math": disable_ocr_math,
        }
    )
    config_parser = ConfigParser(cli_options)
    rendered = convert_pdf(temp_pdf, config_parser)
    # Resolve the parsed page range to locate debug images for the first page.
    page_range = config_parser.generate_config_dict()["page_range"]
    first_page = page_range[0] if page_range else 0

text, ext, images = text_from_rendered(rendered)
with col2:
    if output_format == "markdown":
        text = markdown_insert_images(text, images)
        st.markdown(text, unsafe_allow_html=True)
    elif output_format == "json":
        st.json(text)
    elif output_format == "html":
        st.html(text)
    elif output_format == "chunks":
        st.json(text)

if debug:
    with col1:
        debug_data_path = rendered.metadata.get("debug_data_path")
        if debug_data_path:
            pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png")
            img = Image.open(pdf_image_path)
            st.image(img, caption="PDF debug image", use_container_width=True)
            layout_image_path = os.path.join(
                debug_data_path, f"layout_page_{first_page}.png"
            )
            img = Image.open(layout_image_path)
            st.image(img, caption="Layout debug image", use_container_width=True)
            st.write("Raw output:")
            st.code(text, language=output_format)
169 |
```
--------------------------------------------------------------------------------
/benchmarks/throughput/main.py:
--------------------------------------------------------------------------------
```python
1 | import os
2 | import tempfile
3 | import time
4 | from multiprocessing import get_context
5 | from concurrent.futures import ProcessPoolExecutor
6 | import torch
7 |
8 | import click
9 | import pypdfium2 as pdfium
10 | from tqdm import tqdm
11 |
12 | import datasets
13 |
14 |
def get_next_pdf(ds, i: int):
    """Return the next usable PDF from ``ds`` starting at index ``i``, wrapping around.

    ``ds`` is any sequence of dicts with "pdf" (bytes) and "filename" keys
    (e.g. a datasets.Dataset).  An entry is usable when its pdf payload is
    truthy and its filename ends with ".pdf".

    Returns:
        (pdf_bytes, filename, next_index) where next_index is the position
        to resume scanning from on the next call.

    Raises:
        ValueError: if a full pass over the dataset finds no usable PDF
        (the original implementation looped forever in this case).
    """
    total = len(ds)
    checked = 0
    while checked < total:
        pdf = ds[i]["pdf"]
        filename = ds[i]["filename"]
        if pdf and filename.endswith(".pdf"):
            return pdf, filename, i + 1
        i += 1
        checked += 1
        # Wrap around to the start of the dataset.
        if i >= total:
            i = 0
    raise ValueError("Dataset contains no usable PDF entries")
24 |
25 |
def single_batch(
    batch_size: int,
    num_threads: int,
    force_ocr: bool,
    quantize: bool,
    compile: bool,
    worker_id: int,
    chunksize: int = 100,
):
    """Convert ``batch_size`` PDFs in this worker process and return throughput stats.

    Returns a tuple of (total_conversion_seconds, start_timestamp,
    end_timestamp, peak_gpu_vram_gb, total_pages, total_markdown_chars).
    NOTE(review): assumes a CUDA device is available (torch.cuda calls).
    """
    # Quantization/compilation are toggled via env vars read by the model code.
    if quantize:
        os.environ["RECOGNITION_MODEL_QUANTIZE"] = "true"
    if compile:
        os.environ["COMPILE_ALL"] = "true"

    # Cap CPU thread pools so multiple workers don't oversubscribe the machine.
    for item in [
        "DETECTOR_POSTPROCESSING_CPU_WORKERS",
        "OPENBLAS_NUM_THREADS",
        "PDFTEXT_CPU_WORKERS",
        "OMP_NUM_THREADS",
    ]:
        os.environ[item] = f"{num_threads}"

    torch.set_num_threads(num_threads)

    # Import after the env vars are set so the libraries pick them up.
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict
    from marker.output import text_from_rendered

    ds = datasets.load_dataset("datalab-to/pdfs", split="train")
    model_dict = create_model_dict()
    torch.cuda.reset_peak_memory_stats()

    times = []  # per-chunk conversion durations (model loading excluded)
    i = 0  # dataset cursor for get_next_pdf
    pages = 0
    chars = 0

    min_time = time.time()  # wall-clock start of the conversion phase
    for _ in range(batch_size):
        pdf, fname, i = get_next_pdf(ds, i)
        print(f"Inferencing {fname} on worker {worker_id}...")

        pdf_doc = pdfium.PdfDocument(pdf)
        page_count = len(pdf_doc)
        pdf_doc.close()
        pages += page_count

        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            f.write(pdf)
            f.flush()
            # Convert in chunks of pages to bound per-call memory usage.
            page_range_chunks = list(range(0, page_count, chunksize))
            for chunk_start in page_range_chunks:
                chunk_end = min(chunk_start + chunksize, page_count)
                page_range = list(range(chunk_start, chunk_end))

                block_converter = PdfConverter(
                    artifact_dict=model_dict,
                    config={
                        "disable_tqdm": worker_id > 0,  # only worker 0 shows progress bars
                        "page_range": page_range,
                        "force_ocr": force_ocr,
                    },
                )
                start = time.time()
                rendered = block_converter(f.name)
                markdown, _, _ = text_from_rendered(rendered)
                chars += len(markdown)

                total = time.time() - start
                times.append(total)

    max_gpu_vram = torch.cuda.max_memory_reserved() / 1024**3
    max_time = time.time()  # wall-clock end of the conversion phase
    return sum(times), min_time, max_time, max_gpu_vram, pages, chars
100 |
101 |
@click.command(help="Benchmark PDF to MD conversion throughput.")
@click.option("--workers", default=1, help="Number of workers to use.")
@click.option("--batch_size", default=1, help="Batch size for inference.")
@click.option("--force_ocr", is_flag=True, help="Force OCR on all pages.")
@click.option("--quantize", is_flag=True, help="Use quantized model.")
@click.option("--compile", is_flag=True, help="Use compiled model.")
def main(
    workers: int,
    batch_size: int,
    force_ocr: bool,
    quantize: bool,
    compile: bool,
):
    """Spawn worker processes, aggregate their per-worker stats, and print a throughput report."""
    total_cpus = os.cpu_count()
    start = time.time()
    # Baseline VRAM before workers start; subtracted from each worker's peak.
    current_gpu_vram = torch.cuda.memory_reserved() / 1024**3
    # Spawn (not fork) so each worker initializes CUDA cleanly.
    with ProcessPoolExecutor(
        max_workers=workers, mp_context=get_context("spawn")
    ) as executor:
        # Split CPUs across workers, clamped to the 2-8 range.
        cpus_per_worker = min(8, max(2, total_cpus // workers))
        futures = [
            executor.submit(
                single_batch,
                batch_size,
                cpus_per_worker,
                force_ocr,
                quantize,
                compile,
                i,
            )
            for i in range(workers)
        ]
        all_times = []
        min_time = None  # earliest conversion-phase start across workers
        max_time = time.time()  # latest conversion-phase end across workers
        vrams = []
        page_count = 0
        char_count = 0
        for future in tqdm(futures, desc="Running marker workers..."):
            times, min_time_worker, max_time_worker, max_vram, pages, chars = (
                future.result()
            )
            vrams.append(max_vram - current_gpu_vram)
            all_times.append(times)
            page_count += pages
            char_count += chars
            min_time = (
                min(min_time_worker, min_time)
                if min_time is not None
                else min_time_worker
            )
            max_time = max(max_time, max_time_worker)

    end = time.time() - start
    # Conversion-only wall time: span between first worker start and last worker end.
    all_worker_time = max_time - min_time

    print(f"Average time per worker: {sum(all_times) / len(all_times)}")
    print(f"Max time per worker: {max(all_times)}")
    print(f"End to end time (counting model loading), all processes: {end}")
    print(f"End to end time (no model loading), all processes: {all_worker_time}")
    print(f"Total pages: {page_count}")
    print(f"Total characters: {char_count}")
    print(f"Time per page: {all_worker_time / page_count:.2f}")
    print(f"Characters per second: {char_count / all_worker_time:.2f}")
    print(f"Max GPU VRAM: {max(vrams):.2f} GB")
    print(f"Average GPU VRAM: {sum(vrams) / len(vrams):.2f} GB")


if __name__ == "__main__":
    main()
172 |
```
--------------------------------------------------------------------------------
/marker/renderers/__init__.py:
--------------------------------------------------------------------------------
```python
1 | import base64
2 | import io
3 | import re
4 | from collections import Counter
5 | from typing import Annotated, Optional, Tuple, Literal
6 |
7 | from bs4 import BeautifulSoup
8 | from pydantic import BaseModel
9 |
10 | from marker.schema import BlockTypes
11 | from marker.schema.blocks.base import BlockId, BlockOutput
12 | from marker.schema.document import Document
13 | from marker.settings import settings
14 | from marker.util import assign_config
15 |
16 |
class BaseRenderer:
    """Base class for document renderers.

    Subclasses implement ``__call__`` to turn a Document into an output format.
    This base provides image extraction, HTML post-processing helpers, and
    metadata generation shared by all renderers.
    """

    image_blocks: Annotated[
        Tuple[BlockTypes, ...], "The block types to consider as images."
    ] = (BlockTypes.Picture, BlockTypes.Figure)
    extract_images: Annotated[bool, "Extract images from the document."] = True
    image_extraction_mode: Annotated[
        Literal["lowres", "highres"],
        "The mode to use for extracting images.",
    ] = "highres"
    keep_pageheader_in_output: Annotated[
        bool, "Keep the page header in the output HTML."
    ] = False
    keep_pagefooter_in_output: Annotated[
        bool, "Keep the page footer in the output HTML."
    ] = False
    add_block_ids: Annotated[bool, "Whether to add block IDs to the output HTML."] = (
        False
    )

    def __init__(self, config: Optional[BaseModel | dict] = None):
        assign_config(self, config)

        # Options forwarded to block-level rendering.
        self.block_config = {
            "keep_pageheader_in_output": self.keep_pageheader_in_output,
            "keep_pagefooter_in_output": self.keep_pagefooter_in_output,
            "add_block_ids": self.add_block_ids,
        }

    def __call__(self, document):
        # Children are in reading order
        raise NotImplementedError

    def extract_image(self, document: Document, image_id, to_base64=False):
        """Crop the image for ``image_id`` from its page; optionally return it base64-encoded."""
        image_block = document.get_block(image_id)
        cropped = image_block.get_image(
            document, highres=self.image_extraction_mode == "highres"
        )

        if to_base64:
            image_buffer = io.BytesIO()
            # RGBA to RGB
            if not cropped.mode == "RGB":
                cropped = cropped.convert("RGB")

            cropped.save(image_buffer, format=settings.OUTPUT_IMAGE_FORMAT)
            cropped = base64.b64encode(image_buffer.getvalue()).decode(
                settings.OUTPUT_ENCODING
            )
        return cropped

    @staticmethod
    def merge_consecutive_math(html, tag="math"):
        """Join hyphen-split math: replaces "-</tag><tag>" boundaries with a space, merging the spans."""
        if not html:
            return html
        pattern = rf"-</{tag}>(\s*)<{tag}>"
        html = re.sub(pattern, " ", html)

        # Same merge for inline-display math tags.
        pattern = rf'-</{tag}>(\s*)<{tag} display="inline">'
        html = re.sub(pattern, " ", html)
        return html

    @staticmethod
    def merge_consecutive_tags(html, tag):
        """Collapse back-to-back identical tags, keeping one space only if whitespace separated them."""
        if not html:
            return html

        def replace_whitespace(match):
            whitespace = match.group(1)
            if len(whitespace) == 0:
                return ""
            else:
                return " "

        pattern = rf"</{tag}>(\s*)<{tag}>"

        # Repeat until stable: each pass can create new adjacent pairs.
        while True:
            new_merged = re.sub(pattern, replace_whitespace, html)
            if new_merged == html:
                break
            html = new_merged

        return html

    def generate_page_stats(self, document: Document, document_output):
        """Per-page stats: block-type counts, extraction method, aggregated block metadata."""
        page_stats = []
        for page in document.pages:
            block_counts = Counter(
                [str(block.block_type) for block in page.children]
            ).most_common()
            block_metadata = page.aggregate_block_metadata()
            page_stats.append(
                {
                    "page_id": page.page_id,
                    "text_extraction_method": page.text_extraction_method,
                    "block_counts": block_counts,
                    "block_metadata": block_metadata.model_dump(),
                }
            )
        return page_stats

    def generate_document_metadata(self, document: Document, document_output):
        """Build the metadata dict attached to rendered output (TOC, page stats, debug path)."""
        metadata = {
            "table_of_contents": document.table_of_contents,
            "page_stats": self.generate_page_stats(document, document_output),
        }
        if document.debug_data_path is not None:
            metadata["debug_data_path"] = document.debug_data_path

        return metadata

    def extract_block_html(self, document: Document, block_output: BlockOutput):
        """Resolve <content-ref> placeholders in a block's HTML, collecting any images.

        Returns (html, images) where images maps a BlockId to a base64 string.
        """
        soup = BeautifulSoup(block_output.html, "html.parser")

        content_refs = soup.find_all("content-ref")
        ref_block_id = None
        images = {}
        for ref in content_refs:
            src = ref.get("src")
            sub_images = {}
            # NOTE(review): if no child matches ``src``, ref_block_id keeps its
            # previous value (or None) and ``content`` is unbound below — this
            # assumes every content-ref always has a matching child; confirm.
            for item in block_output.children:
                if item.id == src:
                    content, sub_images_ = self.extract_block_html(document, item)
                    sub_images.update(sub_images_)
                    ref_block_id: BlockId = item.id
                    break

            if ref_block_id.block_type in self.image_blocks and self.extract_images:
                images[ref_block_id] = self.extract_image(
                    document, ref_block_id, to_base64=True
                )
            else:
                images.update(sub_images)
                ref.replace_with(BeautifulSoup(content, "html.parser"))

        if block_output.id.block_type in self.image_blocks and self.extract_images:
            images[block_output.id] = self.extract_image(
                document, block_output.id, to_base64=True
            )

        return str(soup), images
157 |
```
--------------------------------------------------------------------------------
/marker/processors/llm/llm_sectionheader.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | from typing import List, Tuple
3 |
4 | from tqdm import tqdm
5 |
6 | from marker.logger import get_logger
7 | from marker.processors.llm import BaseLLMComplexBlockProcessor
8 | from marker.schema import BlockTypes
9 | from marker.schema.blocks import Block
10 | from marker.schema.document import Document
11 | from marker.schema.groups import PageGroup
12 | from pydantic import BaseModel
13 |
14 | logger = get_logger()
15 |
16 |
class LLMSectionHeaderProcessor(BaseLLMComplexBlockProcessor):
    """LLM pass that corrects section-header levels (h1-h6) across the whole document.

    All section headers are serialized to JSON (with page and geometry info),
    sent to the LLM in a single request, and any returned corrections are
    applied back onto the blocks.
    """

    page_prompt = """You're a text correction expert specializing in accurately analyzing complex PDF documents. You will be given a list of all of the section headers from a document, along with their page number and approximate dimensions. The headers will be formatted like below, and will be presented in order.

```json
[
    {
        "bbox": [x1, y1, x2, y2],
        "width": x2 - x1,
        "height": y2 - y1,
        "page": 0,
        "id": "/page/0/SectionHeader/1",
        "html": "<h1>Introduction</h1>",
    }, ...
]
```

Bboxes have been normalized to 0-1000.

Your goal is to make sure that the section headers have the correct levels (h1, h2, h3, h4, h5, or h6). If a section header does not have the right level, edit the html to fix it.

Guidelines:
- Edit the blocks to ensure that the section headers have the correct levels.
- Only edit the h1, h2, h3, h4, h5, and h6 tags. Do not change any other tags or content in the headers.
- Only output the headers that changed (if nothing changed, output nothing).
- Every header you output needs to have one and only one level tag (h1, h2, h3, h4, h5, or h6).

**Instructions:**
1. Carefully examine the provided section headers and JSON.
2. Identify any changes you'll need to make, and write a short analysis.
3. Output "no_corrections", or "corrections_needed", depending on whether you need to make changes.
4. If corrections are needed, output any blocks that need updates. Only output the block ids and html, like this:
```json
[
    {
        "id": "/page/0/SectionHeader/1",
        "html": "<h2>Introduction</h2>"
    },
    ...
]
```

**Example:**
Input:
Section Headers
```json
[
    {
        "bbox": [x1, y1, x2, y2],
        "id": "/page/0/SectionHeader/1",
        "page": 0,
        "html": "1 Vector Operations",
    },
    {
        "bbox": [x1, y1, x2, y2],
        "id": "/page/0/SectionHeader/2",
        "page": 0,
        "html": "1.1 Vector Addition",
    },
]
```
Output:
Analysis: The first section header is missing the h1 tag, and the second section header is missing the h2 tag.
```json
[
    {
        "id": "/page/0/SectionHeader/1",
        "html": "<h1>1 Vector Operations</h1>"
    },
    {
        "id": "/page/0/SectionHeader/2",
        "html": "<h2>1.1 Vector Addition</h2>"
    }
]
```

**Input:**
Section Headers
```json
{{section_header_json}}
```
"""

    def get_selected_blocks(
        self,
        document: Document,
        page: PageGroup,
    ) -> List[dict]:
        """Serialize every block on ``page`` into the normalized JSON form."""
        selected_blocks = page.structure_blocks(document)
        json_blocks = [
            self.normalize_block_json(block, document, page, i)
            for i, block in enumerate(selected_blocks)
        ]
        return json_blocks

    def process_rewriting(
        self, document: Document, section_headers: List[Tuple[Block, dict]]
    ):
        """Send all section headers to the LLM and apply any returned level corrections."""
        section_header_json = [sh[1] for sh in section_headers]
        for item in section_header_json:
            # Ids look like "/page/<page>/SectionHeader/<idx>".
            _, _, page_id, block_type, block_id = item["id"].split("/")
            item["page"] = page_id
            item["width"] = item["bbox"][2] - item["bbox"][0]
            item["height"] = item["bbox"][3] - item["bbox"][1]
            del item["block_type"]  # Not needed, since they're all section headers

        prompt = self.page_prompt.replace(
            "{{section_header_json}}", json.dumps(section_header_json)
        )
        # No image needed; the first page is passed only as the metadata-bearing block.
        response = self.llm_service(
            prompt, None, document.pages[0], SectionHeaderSchema
        )
        # Fix: log message previously misspelled "reponse".
        logger.debug(f"Got section header response from LLM: {response}")

        if not response or "correction_type" not in response:
            logger.warning("LLM did not return a valid response")
            return

        correction_type = response["correction_type"]
        if correction_type == "no_corrections":
            return

        self.load_blocks(response)
        self.handle_rewrites(response["blocks"], document)

    def load_blocks(self, response):
        # Some models return the block list as a JSON string; decode it in place.
        if isinstance(response["blocks"], str):
            response["blocks"] = json.loads(response["blocks"])

    def rewrite_blocks(self, document: Document):
        """Collect all section headers in the document and run one correction pass."""
        # Don't show progress if there are no blocks to process
        section_headers = [
            (block, self.normalize_block_json(block, document, page))
            for page in document.pages
            for block in page.structure_blocks(document)
            if block.block_type == BlockTypes.SectionHeader
        ]
        if len(section_headers) == 0:
            return

        pbar = tqdm(
            total=1,
            desc=f"Running {self.__class__.__name__}",
            disable=self.disable_tqdm,
        )

        self.process_rewriting(document, section_headers)
        pbar.update(1)
        pbar.close()
165 |
166 |
class BlockSchema(BaseModel):
    """A single corrected block returned by the LLM."""

    id: str  # Block id, e.g. "/page/0/SectionHeader/1"
    html: str  # Corrected HTML for the block
170 |
171 |
class SectionHeaderSchema(BaseModel):
    """Response schema for the section-header correction LLM call."""

    analysis: str  # Free-form reasoning from the model
    correction_type: str  # "no_corrections" means nothing should change
    blocks: List[BlockSchema]  # Rewritten section header blocks
176 |
```
--------------------------------------------------------------------------------
/marker/renderers/html.py:
--------------------------------------------------------------------------------
```python
1 | import textwrap
2 |
3 | from PIL import Image
4 | from typing import Annotated, Tuple
5 |
6 | from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
7 | from pydantic import BaseModel
8 |
9 | from marker.renderers import BaseRenderer
10 | from marker.schema import BlockTypes
11 | from marker.schema.blocks import BlockId
12 | from marker.settings import settings
13 |
14 | # Ignore beautifulsoup warnings
15 | import warnings
16 |
17 | warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
18 |
19 | # Suppress DecompressionBombError
20 | Image.MAX_IMAGE_PIXELS = None
21 |
22 |
class HTMLOutput(BaseModel):
    """Rendered HTML plus extracted images and document metadata."""

    html: str  # Full HTML document string
    images: dict  # Maps image filename -> extracted image object
    metadata: dict  # Document-level metadata from generate_document_metadata
27 |
28 |
class HTMLRenderer(BaseRenderer):
    """
    A renderer for HTML output.

    Walks the rendered document tree, replacing <content-ref> placeholder
    tags with the HTML of the referenced child blocks, extracting images
    along the way.
    """

    page_blocks: Annotated[
        Tuple[BlockTypes],
        "The block types to consider as pages.",
    ] = (BlockTypes.Page,)
    paginate_output: Annotated[
        bool,
        "Whether to paginate the output.",
    ] = False

    def extract_image(self, document, image_id):
        """Return the cropped image for the block with id *image_id*."""
        image_block = document.get_block(image_id)
        cropped = image_block.get_image(
            document, highres=self.image_extraction_mode == "highres"
        )
        return cropped

    def insert_block_id(self, soup, block_id: BlockId):
        """
        Insert a block ID into the soup as a data attribute.
        """
        # Lines and spans are too granular to tag individually.
        if block_id.block_type in [BlockTypes.Line, BlockTypes.Span]:
            return soup

        if self.add_block_ids:
            # Find the outermost tag (first tag that isn't a NavigableString)
            outermost_tag = None
            for element in soup.contents:
                if hasattr(element, "name") and element.name:
                    outermost_tag = element
                    break

            # If we found an outermost tag, add the data-block-id attribute
            if outermost_tag:
                outermost_tag["data-block-id"] = str(block_id)

            # If soup only contains text or no tags, wrap in a span
            elif soup.contents:
                wrapper = soup.new_tag("span")
                wrapper["data-block-id"] = str(block_id)

                contents = list(soup.contents)
                for content in contents:
                    content.extract()
                    wrapper.append(content)
                soup.append(wrapper)
        return soup

    def extract_html(self, document, document_output, level=0):
        """
        Recursively resolve <content-ref> tags into HTML.

        Returns a (html, images) tuple; images maps filenames to extracted
        image objects. At level 0 the output is wrapped in a full HTML
        document skeleton and consecutive b/i/math tags are merged.
        """
        soup = BeautifulSoup(document_output.html, "html.parser")

        content_refs = soup.find_all("content-ref")
        ref_block_id = None
        images = {}
        for ref in content_refs:
            src = ref.get("src")
            sub_images = {}
            content = ""
            # NOTE(review): if no child matches src, ref_block_id keeps its
            # value from the previous iteration (None on the first), which
            # would misattribute or crash below — presumably every
            # content-ref always has a matching child; confirm upstream.
            for item in document_output.children:
                if item.id == src:
                    content, sub_images_ = self.extract_html(document, item, level + 1)
                    sub_images.update(sub_images_)
                    ref_block_id: BlockId = item.id
                    break

            if ref_block_id.block_type in self.image_blocks:
                if self.extract_images:
                    image = self.extract_image(document, ref_block_id)
                    image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
                    images[image_name] = image
                    element = BeautifulSoup(
                        f"<p>{content}<img src='{image_name}'></p>", "html.parser"
                    )
                    ref.replace_with(self.insert_block_id(element, ref_block_id))
                else:
                    # This will be the image description if using llm mode, or empty if not
                    element = BeautifulSoup(f"{content}", "html.parser")
                    ref.replace_with(self.insert_block_id(element, ref_block_id))
            elif ref_block_id.block_type in self.page_blocks:
                images.update(sub_images)
                if self.paginate_output:
                    content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
                element = BeautifulSoup(f"{content}", "html.parser")
                ref.replace_with(self.insert_block_id(element, ref_block_id))
            else:
                images.update(sub_images)
                element = BeautifulSoup(f"{content}", "html.parser")
                ref.replace_with(self.insert_block_id(element, ref_block_id))

        output = str(soup)
        if level == 0:
            output = self.merge_consecutive_tags(output, "b")
            output = self.merge_consecutive_tags(output, "i")
            output = self.merge_consecutive_math(
                output
            )  # Merge consecutive inline math tags
            output = textwrap.dedent(f"""
            <!DOCTYPE html>
            <html>
                <head>
                    <meta charset="utf-8" />
                </head>
                <body>
                    {output}
                </body>
            </html>
""")

        return output, images

    def __call__(self, document) -> HTMLOutput:
        """Render *document* to a complete HTML document with images."""
        document_output = document.render(self.block_config)
        full_html, images = self.extract_html(document, document_output)
        soup = BeautifulSoup(full_html, "html.parser")
        full_html = soup.prettify()  # Add indentation to the HTML
        return HTMLOutput(
            html=full_html,
            images=images,
            metadata=self.generate_document_metadata(document, document_output),
        )
153 |
```
--------------------------------------------------------------------------------
/marker/extractors/page.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | from concurrent.futures import ThreadPoolExecutor
3 |
4 | from pydantic import BaseModel
5 | from typing import Annotated, Optional, List
6 |
7 | from tqdm import tqdm
8 |
9 | from marker.extractors import BaseExtractor
10 | from marker.logger import get_logger
11 |
12 | logger = get_logger()
13 |
14 |
class PageExtractionSchema(BaseModel):
    """LLM response for one chunk of pages during structured extraction."""

    description: str  # Short summary of schema fields vs. page content
    detailed_notes: str  # Per-page notes with candidate JSON snippets
18 |
19 |
class PageExtractor(BaseExtractor):
    """
    An extractor that pulls data from a single page.

    Pages are grouped into chunks of ``extraction_page_chunk_size`` and each
    chunk is sent to the LLM service concurrently; results come back as
    ``PageExtractionSchema`` notes to be merged later.
    """

    extraction_page_chunk_size: Annotated[
        int, "The number of pages to chunk together for extraction."
    ] = 3

    page_schema: Annotated[
        str,
        "The JSON schema to be extracted from the page.",
    ] = ""

    page_extraction_prompt = """You are an expert document analyst who reads documents and pulls data out in JSON format. You will receive the markdown representation of a document page, and a JSON schema that we want to extract from the document. Your task is to write detailed notes on this page, so that when you look at all your notes from across the document, you can fill in the schema.

Some notes:
- The schema may contain a single object to extract from the entire document, or an array of objects.
- The schema may contain nested objects, arrays, and other complex structures.

Some guidelines:
- Write very thorough notes, and include specific JSON snippets that can be extracted from the page.
- You may need information from prior or subsequent pages to fully fill in the schema, so make sure to write detailed notes that will let you join entities across pages later on.
- Estimate your confidence in the values you extract, so you can reconstruct the JSON later when you only have your notes.
- Some tables and other data structures may continue on a subsequent page, so make sure to store the positions that data comes from where appropriate.

**Instructions:**
1. Analyze the provided markdown representation of the page.
2. Analyze the JSON schema.
3. Write a short description of the fields in the schema, and the associated values in the markdown.
4. Write detailed notes on the page, including any values that can be extracted from the markdown. Include snippets of JSON that can be extracted from the page where possible.

**Example:**
Input:

Markdown
```markdown
| Make   | Sales |
|--------|-------|
| Honda  | 100   |
| Toyota | 200   |
```

Schema

```json
{'$defs': {'Cars': {'properties': {'make': {'title': 'Make', 'type': 'string'}, 'sales': {'title': 'Sales', 'type': 'integer'}, 'color': {'title': 'Color', 'type': 'string'}}, 'required': ['make', 'sales', 'color'], 'title': 'Cars', 'type': 'object'}}, 'properties': {'cars': {'items': {'$ref': '#/$defs/Cars'}, 'title': 'Cars', 'type': 'array'}}, 'required': ['cars'], 'title': 'CarsList', 'type': 'object'}
```

Output:

Description: The schema has a list of cars, each with a make, sales, and color. The image and markdown contain a table with 2 cars: Honda with 100 sales and Toyota with 200 sales. The color is not present in the table.
Detailed Notes: On this page, I see a table with car makes and sales. The makes are Honda and Toyota, with sales of 100 and 200 respectively. The color is not present in the table, so I will leave it blank in the JSON. That information may be present on another page. Some JSON snippets I may find useful later are:
```json
{
    "make": "Honda",
    "sales": 100,
}
```
```json
{
    "make": "Toyota",
    "sales": 200,
}
```

Honda is the first row in the table, and Toyota is the second row. Make is the first column, and sales is the second.

**Input:**

Markdown
```markdown
{{page_md}}
```

Schema
```json
{{schema}}
```
"""

    def chunk_page_markdown(self, page_markdown: List[str]) -> List[str]:
        """
        Chunk the page markdown into smaller pieces for processing.
        """

        chunks = []
        for i in range(0, len(page_markdown), self.extraction_page_chunk_size):
            chunk = page_markdown[i : i + self.extraction_page_chunk_size]
            chunks.append("\n\n".join(chunk))

        return chunks

    def inference_single_chunk(
        self, page_markdown: str
    ) -> Optional[PageExtractionSchema]:
        """Run extraction on one markdown chunk; None on invalid LLM output."""
        prompt = self.page_extraction_prompt.replace(
            "{{page_md}}", page_markdown
        ).replace("{{schema}}", json.dumps(self.page_schema))
        response = self.llm_service(prompt, None, None, PageExtractionSchema)
        logger.debug(f"Page extraction response: {response}")

        # Both keys must be present for the response to be usable.
        if not response or any(
            [
                key not in response
                for key in [
                    "description",
                    "detailed_notes",
                ]
            ]
        ):
            return None

        return PageExtractionSchema(
            description=response["description"],
            detailed_notes=response["detailed_notes"],
        )

    def __call__(
        self,
        page_markdown: List[str],
        **kwargs,
    ) -> List[PageExtractionSchema]:
        """Extract notes for every chunk of pages, in order.

        Raises:
            ValueError: If no page schema was configured.
        """
        if not self.page_schema:
            raise ValueError(
                "Page schema must be defined for structured extraction to work."
            )

        chunks = self.chunk_page_markdown(page_markdown)
        results = []
        pbar = tqdm(
            desc="Running page extraction",
            disable=self.disable_tqdm,
            total=len(chunks),
        )

        # Futures are consumed in submission order, so results stay aligned
        # with the input chunk order.
        with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
            for future in [
                executor.submit(self.inference_single_chunk, chunk) for chunk in chunks
            ]:
                results.append(future.result())  # Raise exceptions if any occurred
                pbar.update(1)

        pbar.close()
        return results
165 |
```
--------------------------------------------------------------------------------
/marker/scripts/common.py:
--------------------------------------------------------------------------------
```python
1 | import ast
2 | import base64
3 | import io
4 | import re
5 | import sys
6 | from typing import Optional
7 |
8 | from PIL import Image
9 | import click
10 | import pypdfium2
11 | import streamlit as st
12 | from pydantic import BaseModel
13 | from streamlit.runtime.uploaded_file_manager import UploadedFile
14 |
15 | from marker.config.parser import ConfigParser
16 | from marker.config.printer import CustomClickPrinter
17 | from marker.models import create_model_dict
18 | from marker.settings import settings
19 |
20 |
@st.cache_data()
def parse_args():
    """Parse marker's common CLI options from sys.argv for the streamlit app.

    Returns the parsed params dict, or {"error": ...} if parsing failed.
    Cached by streamlit, so argv is only parsed once per session.
    """

    # Use to grab common cli options
    @ConfigParser.common_options
    def options_func():
        pass

    def extract_click_params(decorated_function):
        # Click stores options attached by decorators on __click_params__.
        if hasattr(decorated_function, "__click_params__"):
            return decorated_function.__click_params__
        return []

    cmd = CustomClickPrinter("Marker app.")
    extracted_params = extract_click_params(options_func)
    cmd.params.extend(extracted_params)
    ctx = click.Context(cmd)
    try:
        cmd_args = sys.argv[1:]
        cmd.parse_args(ctx, cmd_args)
        return ctx.params
    except click.exceptions.ClickException as e:
        return {"error": str(e)}
43 |
44 |
@st.cache_resource()
def load_models():
    """Create and cache the model dict once per streamlit server process."""
    return create_model_dict()
48 |
49 |
def open_pdf(pdf_file):
    """Open an uploaded file's bytes as a pypdfium2 document."""
    return pypdfium2.PdfDocument(io.BytesIO(pdf_file.getvalue()))
53 |
54 |
def img_to_html(img, img_alt):
    """Encode a PIL image as an inline base64 <img> tag."""
    buffer = io.BytesIO()
    img.save(buffer, format=settings.OUTPUT_IMAGE_FORMAT)
    encoded = base64.b64encode(buffer.getvalue()).decode()
    fmt = settings.OUTPUT_IMAGE_FORMAT.lower()
    return (
        f'<img src="data:image/{fmt};base64,{encoded}"'
        f' alt="{img_alt}" style="max-width: 100%;">'
    )
62 |
63 |
@st.cache_data()
def get_page_image(pdf_file, page_num, dpi=96):
    """Render one page of an uploaded PDF (or open an image file) as RGB.

    Args:
        pdf_file: Streamlit UploadedFile; PDFs are detected via MIME type.
        page_num: Zero-based page index (ignored for image files).
        dpi: Render resolution; scale is dpi/72 since PDF units are points.
    """
    if "pdf" in pdf_file.type:
        doc = open_pdf(pdf_file)
        page = doc[page_num]
        png_image = (
            page.render(
                scale=dpi / 72,
            )
            .to_pil()
            .convert("RGB")
        )
    else:
        # Non-PDF uploads are treated as a single image.
        png_image = Image.open(pdf_file).convert("RGB")
    return png_image
79 |
80 |
@st.cache_data()
def page_count(pdf_file: UploadedFile):
    """Return the highest selectable page index for the uploaded file.

    NOTE(review): for PDFs this returns len(doc) - 1 (last zero-based page
    index, presumably for a page-picker widget) but returns 1 for images —
    confirm the caller's expectation; 0 would be the consistent value.
    """
    if "pdf" in pdf_file.type:
        doc = open_pdf(pdf_file)
        return len(doc) - 1
    else:
        return 1
88 |
89 |
def pillow_image_to_base64_string(img: Image) -> str:
    """Serialize *img* as JPEG and return the bytes base64-encoded."""
    with io.BytesIO() as buf:
        img.save(buf, format="JPEG")
        payload = buf.getvalue()
    return base64.b64encode(payload).decode("utf-8")
94 |
95 |
def extract_root_pydantic_class(schema_code: str) -> Optional[str]:
    """Find the name of the "root" pydantic model in a schema code snippet.

    The root is the BaseModel subclass that references other models in its
    field annotations but is not referenced by any of them. Returns None
    when no unambiguous root exists or the code cannot be parsed.

    Fix: class names are now collected in a first pass before analyzing
    field annotations. The previous single-pass walk missed references to
    classes defined *after* the referencing class (forward references), so
    a root declared first in the source was never detected.
    """
    try:
        tree = ast.parse(schema_code)

        # Pass 1: collect every class definition that inherits from BaseModel,
        # so references to later-defined classes are also detected.
        pydantic_nodes = []
        class_names = set()
        for node in ast.walk(tree):
            if isinstance(node, ast.ClassDef) and any(
                isinstance(base, ast.Name) and base.id == "BaseModel"
                for base in node.bases
            ):
                pydantic_nodes.append(node)
                class_names.add(node.name)

        if len(class_names) == 1:
            return list(class_names)[0]

        # Pass 2: record, per class, which other pydantic classes appear in
        # its annotated field types.
        class_info = {
            name: {"references": set(), "fields": []} for name in class_names
        }
        for node in pydantic_nodes:
            for item in node.body:
                if isinstance(item, ast.AnnAssign) and isinstance(
                    item.target, ast.Name
                ):
                    class_info[node.name]["fields"].append(item.target.id)
                    annotation_str = ast.unparse(item.annotation)

                    # Look for List[ClassName], Optional[ClassName], Dict[Any, ClassName], etc.
                    for other_class in class_names:
                        pattern = rf"(?:List|Dict|Set|Tuple|Optional|Union)?\[.*{other_class}.*\]|{other_class}"
                        if re.search(pattern, annotation_str):
                            class_info[node.name]["references"].add(other_class)

        referenced_classes = set()
        for info in class_info.values():
            referenced_classes.update(info["references"])

        # Root candidates reference others but aren't referenced themselves.
        root_candidates = {
            name
            for name, info in class_info.items()
            if info["references"] and name not in referenced_classes
        }

        # If we found exactly one root candidate, return it
        if len(root_candidates) == 1:
            return list(root_candidates)[0]

        return None
    except Exception as e:
        # Best-effort helper for user-supplied schema code; report and bail.
        print(f"Error parsing schema: {e}")
        return None
159 |
160 |
def get_root_class(schema_code: str) -> Optional[BaseModel]:
    """Execute user-supplied schema code and return its root pydantic class.

    Returns None if no unambiguous root class name can be determined.

    SECURITY: this exec()s arbitrary code from the user — acceptable only
    because the streamlit app runs locally on the user's own machine; do not
    expose this path to untrusted input.
    """
    root_class_name = extract_root_pydantic_class(schema_code)

    if not root_class_name:
        return None

    # Prepend the imports the snippet almost certainly needs but may omit.
    if "from pydantic" not in schema_code:
        schema_code = "from pydantic import BaseModel\n" + schema_code
    if "from typing" not in schema_code:
        schema_code = (
            "from typing import List, Dict, Optional, Set, Tuple, Union, Any\n\n"
            + schema_code
        )

    # Execute the code in a new namespace
    namespace = {}
    exec(schema_code, namespace)

    # Return the root class object
    return namespace.get(root_class_name)
181 |
```
--------------------------------------------------------------------------------
/marker/config/parser.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import os
3 | from typing import Dict
4 |
5 | import click
6 |
7 | from marker.converters.pdf import PdfConverter
8 | from marker.logger import get_logger
9 | from marker.renderers.chunk import ChunkRenderer
10 | from marker.renderers.html import HTMLRenderer
11 | from marker.renderers.json import JSONRenderer
12 | from marker.renderers.markdown import MarkdownRenderer
13 | from marker.settings import settings
14 | from marker.util import classes_to_strings, parse_range_str, strings_to_classes
15 |
16 | logger = get_logger()
17 |
18 |
class ConfigParser:
    """Translates parsed CLI options into marker's runtime configuration:
    a config dict plus the converter, renderer, processor, and LLM-service
    classes to instantiate."""

    def __init__(self, cli_options: dict):
        self.cli_options = cli_options

    @staticmethod
    def common_options(fn):
        """Click decorator that attaches marker's shared CLI options to *fn*."""
        fn = click.option(
            "--output_dir",
            type=click.Path(exists=False),
            required=False,
            default=settings.OUTPUT_DIR,
            help="Directory to save output.",
        )(fn)
        fn = click.option("--debug", "-d", is_flag=True, help="Enable debug mode.")(fn)
        fn = click.option(
            "--output_format",
            type=click.Choice(["markdown", "json", "html", "chunks"]),
            default="markdown",
            help="Format to output results in.",
        )(fn)
        fn = click.option(
            "--processors",
            type=str,
            default=None,
            help="Comma separated list of processors to use. Must use full module path.",
        )(fn)
        fn = click.option(
            "--config_json",
            type=str,
            default=None,
            help="Path to JSON file with additional configuration.",
        )(fn)
        fn = click.option(
            "--disable_multiprocessing",
            is_flag=True,
            default=False,
            help="Disable multiprocessing.",
        )(fn)
        fn = click.option(
            "--disable_image_extraction",
            is_flag=True,
            default=False,
            help="Disable image extraction.",
        )(fn)
        # these are options that need a list transformation, i.e splitting/parsing a string
        fn = click.option(
            "--page_range",
            type=str,
            default=None,
            help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20",
        )(fn)

        # we put common options here
        fn = click.option(
            "--converter_cls",
            type=str,
            default=None,
            help="Converter class to use. Defaults to PDF converter.",
        )(fn)
        fn = click.option(
            "--llm_service",
            type=str,
            default=None,
            help="LLM service to use - should be full import path, like marker.services.gemini.GoogleGeminiService",
        )(fn)
        return fn

    # NOTE(review): `any` below is the builtin function used as an annotation,
    # not typing.Any — harmless at runtime, but typing.Any was likely intended.
    def generate_config_dict(self) -> Dict[str, any]:
        """Map CLI options to config keys; falsy option values are skipped."""
        config = {}
        output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR)
        for k, v in self.cli_options.items():
            if not v:
                continue

            match k:
                case "debug":
                    # --debug fans out into several debug-output settings.
                    config["debug_pdf_images"] = True
                    config["debug_layout_images"] = True
                    config["debug_json"] = True
                    config["debug_data_folder"] = output_dir
                case "page_range":
                    config["page_range"] = parse_range_str(v)
                case "config_json":
                    # JSON file entries override/extend the CLI-derived config.
                    with open(v, "r", encoding="utf-8") as f:
                        config.update(json.load(f))
                case "disable_multiprocessing":
                    config["pdftext_workers"] = 1
                case "disable_image_extraction":
                    config["extract_images"] = False
                case _:
                    config[k] = v

        # Backward compatibility for google_api_key
        if settings.GOOGLE_API_KEY:
            config["gemini_api_key"] = settings.GOOGLE_API_KEY

        return config

    def get_llm_service(self):
        """Return the LLM service import path, or None when use_llm is off."""
        # Only return an LLM service when use_llm is enabled
        if not self.cli_options.get("use_llm", False):
            return None

        service_cls = self.cli_options.get("llm_service", None)
        if service_cls is None:
            service_cls = "marker.services.gemini.GoogleGeminiService"
        return service_cls

    def get_renderer(self):
        """Return the renderer class (as an import-path string) for the
        requested output format."""
        match self.cli_options["output_format"]:
            case "json":
                r = JSONRenderer
            case "markdown":
                r = MarkdownRenderer
            case "html":
                r = HTMLRenderer
            case "chunks":
                r = ChunkRenderer
            case _:
                raise ValueError("Invalid output format")
        return classes_to_strings([r])[0]

    def get_processors(self):
        """Split and validate --processors; returns the list of import paths
        (not classes), or None when the option wasn't given."""
        processors = self.cli_options.get("processors", None)
        if processors is not None:
            processors = processors.split(",")
            for p in processors:
                try:
                    # Validate that the import path resolves; re-raise if not.
                    strings_to_classes([p])
                except Exception as e:
                    logger.error(f"Error loading processor: {p} with error: {e}")
                    raise

        return processors

    def get_converter_cls(self):
        """Resolve --converter_cls to a class; defaults to PdfConverter."""
        converter_cls = self.cli_options.get("converter_cls", None)
        if converter_cls is not None:
            try:
                return strings_to_classes([converter_cls])[0]
            except Exception as e:
                logger.error(
                    f"Error loading converter: {converter_cls} with error: {e}"
                )
                raise

        return PdfConverter

    def get_output_folder(self, filepath: str):
        """Create (if needed) and return <output_dir>/<input basename>."""
        output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR)
        fname_base = os.path.splitext(os.path.basename(filepath))[0]
        output_dir = os.path.join(output_dir, fname_base)
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def get_base_filename(self, filepath: str):
        """Return the input filename without directory or extension."""
        basename = os.path.basename(filepath)
        return os.path.splitext(basename)[0]
177 |
```
--------------------------------------------------------------------------------
/marker/builders/layout.py:
--------------------------------------------------------------------------------
```python
1 | from typing import Annotated, List
2 |
3 | from surya.layout import LayoutPredictor
4 | from surya.layout.schema import LayoutResult, LayoutBox
5 |
6 | from marker.builders import BaseBuilder
7 | from marker.providers.pdf import PdfProvider
8 | from marker.schema import BlockTypes
9 | from marker.schema.document import Document
10 | from marker.schema.groups.page import PageGroup
11 | from marker.schema.polygon import PolygonBox
12 | from marker.schema.registry import get_block_class
13 | from marker.settings import settings
14 |
15 |
class LayoutBuilder(BaseBuilder):
    """
    A builder for performing layout detection on PDF pages and merging the results into the document.
    """

    layout_batch_size: Annotated[
        int,
        "The batch size to use for the layout model.",
        "Default is None, which will use the default batch size for the model.",
    ] = None
    force_layout_block: Annotated[
        str,
        "Skip layout and force every page to be treated as a specific block type.",
    ] = None
    disable_tqdm: Annotated[
        bool,
        "Disable tqdm progress bars.",
    ] = False
    expand_block_types: Annotated[
        List[BlockTypes],
        "Block types whose bounds should be expanded to accomodate missing regions",
    ] = [
        BlockTypes.Picture,
        BlockTypes.Figure,
        BlockTypes.ComplexRegion,
    ]  # Does not include groups since they are only injected later
    max_expand_frac: Annotated[
        float, "The maximum fraction to expand the layout box bounds by"
    ] = 0.05

    def __init__(self, layout_model: LayoutPredictor, config=None):
        self.layout_model = layout_model

        super().__init__(config)

    def __call__(self, document: Document, provider: PdfProvider):
        """Run layout detection (or forced layout) and attach blocks to pages."""
        if self.force_layout_block is not None:
            # Assign the full content of every page to a single layout type
            layout_results = self.forced_layout(document.pages)
        else:
            layout_results = self.surya_layout(document.pages)
        self.add_blocks_to_pages(document.pages, layout_results)
        self.expand_layout_blocks(document)

    def get_batch_size(self):
        """Return the configured batch size, or a device-dependent default."""
        if self.layout_batch_size is not None:
            return self.layout_batch_size
        elif settings.TORCH_DEVICE_MODEL == "cuda":
            return 12
        return 6

    def forced_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
        """Build synthetic layout results: one full-page box of
        ``force_layout_block`` per page."""
        layout_results = []
        for page in pages:
            layout_results.append(
                LayoutResult(
                    image_bbox=page.polygon.bbox,
                    bboxes=[
                        LayoutBox(
                            label=self.force_layout_block,
                            position=0,
                            top_k={self.force_layout_block: 1},
                            polygon=page.polygon.polygon,
                        ),
                    ],
                    sliced=False,
                )
            )
        return layout_results

    def surya_layout(self, pages: List[PageGroup]) -> List[LayoutResult]:
        """Run the surya layout model on lowres page images."""
        self.layout_model.disable_tqdm = self.disable_tqdm
        layout_results = self.layout_model(
            [p.get_image(highres=False) for p in pages],
            batch_size=int(self.get_batch_size()),
        )
        return layout_results

    def expand_layout_blocks(self, document: Document):
        """Grow picture/figure/complex-region boxes toward their neighbors.

        Each eligible block is expanded by at most ``max_expand_frac`` of its
        own size, and never more than the minimum gap to any other block on
        the page, so expansion cannot create overlaps.
        """
        for page in document.pages:
            # Collect all blocks on this page as PolygonBox for easy access
            page_blocks = [document.get_block(bid) for bid in page.structure]
            page_size = page.polygon.size

            for block_id in page.structure:
                block = document.get_block(block_id)
                if block.block_type in self.expand_block_types:
                    other_blocks = [b for b in page_blocks if b != block]
                    if not other_blocks:
                        # Lone block on the page: expand by the max fraction.
                        block.polygon = block.polygon.expand(
                            self.max_expand_frac, self.max_expand_frac
                        ).fit_to_bounds((0, 0, *page_size))
                        continue

                    min_gap = min(
                        block.polygon.minimum_gap(other.polygon)
                        for other in other_blocks
                    )
                    # Touching/overlapping a neighbor already; don't expand.
                    if min_gap <= 0:
                        continue

                    # Convert the absolute gap into per-axis fractions of the
                    # block's own dimensions (guarding zero-sized blocks).
                    x_expand_frac = (
                        min_gap / block.polygon.width if block.polygon.width > 0 else 0
                    )
                    y_expand_frac = (
                        min_gap / block.polygon.height
                        if block.polygon.height > 0
                        else 0
                    )

                    block.polygon = block.polygon.expand(
                        min(self.max_expand_frac, x_expand_frac),
                        min(self.max_expand_frac, y_expand_frac),
                    ).fit_to_bounds((0, 0, *page_size))

    def add_blocks_to_pages(
        self, pages: List[PageGroup], layout_results: List[LayoutResult]
    ):
        """Convert layout boxes to document blocks, rescaled from layout-image
        coordinates to provider page coordinates."""
        for page, layout_result in zip(pages, layout_results):
            layout_page_size = PolygonBox.from_bbox(layout_result.image_bbox).size
            provider_page_size = page.polygon.size
            page.layout_sliced = (
                layout_result.sliced
            )  # This indicates if the page was sliced by the layout model
            for bbox in sorted(layout_result.bboxes, key=lambda x: x.position):
                block_cls = get_block_class(BlockTypes[bbox.label])
                layout_block = page.add_block(
                    block_cls, PolygonBox(polygon=bbox.polygon)
                )
                layout_block.polygon = layout_block.polygon.rescale(
                    layout_page_size, provider_page_size
                ).fit_to_bounds((0, 0, *provider_page_size))
                # Keep only alternative labels that marker knows about.
                layout_block.top_k = {
                    BlockTypes[label]: prob
                    for (label, prob) in bbox.top_k.items()
                    if label in BlockTypes.__members__
                }
                page.add_structure(layout_block)

            # Ensure page has non-empty structure
            if page.structure is None:
                page.structure = []

            # Ensure page has non-empty children
            if page.children is None:
                page.children = []
162 |
```
--------------------------------------------------------------------------------
/marker/scripts/convert.py:
--------------------------------------------------------------------------------
```python
1 | import atexit
2 | import os
3 | import time
4 |
5 | import psutil
6 | import torch
7 |
8 | from marker.utils.batch import get_batch_sizes_worker_counts
9 |
10 | # Ensure threads don't contend
11 | os.environ["MKL_DYNAMIC"] = "FALSE"
12 | os.environ["OMP_DYNAMIC"] = "FALSE"
13 | os.environ["OMP_NUM_THREADS"] = "2" # Avoid OpenMP issues with multiprocessing
14 | os.environ["OPENBLAS_NUM_THREADS"] = "2"
15 | os.environ["MKL_NUM_THREADS"] = "2"
16 | os.environ["GRPC_VERBOSITY"] = "ERROR"
17 | os.environ["GLOG_minloglevel"] = "2"
18 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = (
19 | "1" # Transformers uses .isin for a simple op, which is not supported on MPS
20 | )
21 | os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya
22 |
23 | import math
24 | import traceback
25 |
26 | import click
27 | import torch.multiprocessing as mp
28 | from tqdm import tqdm
29 | import gc
30 |
31 | from marker.config.parser import ConfigParser
32 | from marker.config.printer import CustomClickPrinter
33 | from marker.logger import configure_logging, get_logger
34 | from marker.models import create_model_dict
35 | from marker.output import output_exists, save_output
36 | from marker.utils.gpu import GPUManager
37 |
38 | configure_logging()
39 | logger = get_logger()
40 |
41 |
def worker_init():
    """Per-worker-process initializer: load models once and keep a global
    reference for process_single_pdf to use."""
    model_dict = create_model_dict()

    global model_refs
    model_refs = model_dict

    # Ensure we clean up the model references on exit
    atexit.register(worker_exit)
50 |
51 |
def worker_exit():
    """Release the worker's global model references (best-effort)."""
    global model_refs
    if "model_refs" in globals():
        del model_refs
58 |
59 |
def process_single_pdf(args):
    """Convert one PDF in a worker process; returns the number of pages done.

    Args:
        args: (filepath, cli_options) tuple — packed this way because the
            function is dispatched through a multiprocessing pool.

    Errors are logged and swallowed so one bad file doesn't kill the batch;
    in that case 0 is returned.
    """
    page_count = 0
    fpath, cli_options = args
    # Thread budget is computed by the parent and must not leak into the
    # ConfigParser options below.
    torch.set_num_threads(cli_options["total_torch_threads"])
    del cli_options["total_torch_threads"]

    config_parser = ConfigParser(cli_options)

    out_folder = config_parser.get_output_folder(fpath)
    base_name = config_parser.get_base_filename(fpath)
    if cli_options.get("skip_existing") and output_exists(out_folder, base_name):
        return page_count

    converter_cls = config_parser.get_converter_cls()
    config_dict = config_parser.generate_config_dict()
    config_dict["disable_tqdm"] = True

    try:
        if cli_options.get("debug_print"):
            logger.debug(f"Converting {fpath}")
        # model_refs is the per-worker global set up by worker_init().
        converter = converter_cls(
            config=config_dict,
            artifact_dict=model_refs,
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service(),
        )
        rendered = converter(fpath)
        out_folder = config_parser.get_output_folder(fpath)
        save_output(rendered, out_folder, base_name)
        page_count = converter.page_count

        if cli_options.get("debug_print"):
            logger.debug(f"Converted {fpath}")
        # Drop large objects promptly to keep worker memory bounded.
        del rendered
        del converter
    except Exception as e:
        logger.error(f"Error converting {fpath}: {e}")
        traceback.print_exc()
    finally:
        gc.collect()

    return page_count
103 |
104 |
@click.command(cls=CustomClickPrinter)
@click.argument("in_folder", type=str)
@click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert")
@click.option(
    "--num_chunks",
    type=int,
    default=1,
    help="Number of chunks being processed in parallel",
)
@click.option(
    "--max_files", type=int, default=None, help="Maximum number of pdfs to convert"
)
@click.option(
    "--skip_existing",
    is_flag=True,
    default=False,
    help="Skip existing converted files.",
)
@click.option(
    "--debug_print", is_flag=True, default=False, help="Print debug information."
)
@click.option(
    "--max_tasks_per_worker",
    type=int,
    default=10,
    help="Maximum number of tasks per worker process before recycling.",
)
@click.option(
    "--workers",
    type=int,
    default=None,
    help="Number of worker processes to use. Set automatically by default, but can be overridden.",
)
@ConfigParser.common_options
def convert_cli(in_folder: str, **kwargs):
    """Convert every PDF in ``in_folder`` using a pool of worker processes.

    Splits the folder's files into ``num_chunks`` contiguous chunks and
    processes only chunk ``chunk_idx``, so several invocations can run in
    parallel (e.g. one per GPU). Prints a throughput summary at the end.
    """
    total_pages = 0
    in_folder = os.path.abspath(in_folder)
    files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
    files = [f for f in files if os.path.isfile(f)]

    # Handle chunks if we're processing in parallel
    # Ensure we get all files into a chunk
    chunk_size = math.ceil(len(files) / kwargs["num_chunks"])
    start_idx = kwargs["chunk_idx"] * chunk_size
    end_idx = start_idx + chunk_size
    files_to_convert = files[start_idx:end_idx]

    # Limit files converted if needed
    if kwargs["max_files"]:
        files_to_convert = files_to_convert[: kwargs["max_files"]]

    # Disable nested multiprocessing
    kwargs["disable_multiprocessing"] = True

    try:
        mp.set_start_method("spawn")  # Required for CUDA, forkserver doesn't work
    except RuntimeError:
        raise RuntimeError(
            "Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again."
        )

    chunk_idx = kwargs["chunk_idx"]

    # Use GPU context manager for automatic setup/cleanup
    with GPUManager(chunk_idx) as gpu_manager:
        # NOTE(review): get_batch_sizes_worker_counts is not in the visible
        # imports -- presumably imported earlier in this file; confirm.
        batch_sizes, workers = get_batch_sizes_worker_counts(gpu_manager, 7)

        # Override workers if specified
        if kwargs["workers"] is not None:
            workers = kwargs["workers"]

        # Set proper batch sizes and thread counts
        # Each worker gets an even share of the physical cores (minimum of 2).
        total_processes = max(1, min(len(files_to_convert), workers))
        kwargs["total_torch_threads"] = max(
            2, psutil.cpu_count(logical=False) // total_processes
        )
        kwargs.update(batch_sizes)

        logger.info(
            f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}"
        )
        task_args = [(f, kwargs) for f in files_to_convert]

        start_time = time.time()
        with mp.Pool(
            processes=total_processes,
            initializer=worker_init,
            maxtasksperchild=kwargs["max_tasks_per_worker"],
        ) as pool:
            pbar = tqdm(total=len(task_args), desc="Processing PDFs", unit="pdf")
            for page_count in pool.imap_unordered(process_single_pdf, task_args):
                pbar.update(1)
                total_pages += page_count
            pbar.close()

        total_time = time.time() - start_time
        # NOTE(review): divides by total_time; a zero-duration run would raise
        # ZeroDivisionError -- likely acceptable for a CLI summary, but confirm.
        print(
            f"Inferenced {total_pages} pages in {total_time:.2f} seconds, for a throughput of {total_pages / total_time:.2f} pages/sec for chunk {chunk_idx + 1}/{kwargs['num_chunks']}"
        )
204 |
```
--------------------------------------------------------------------------------
/marker/processors/llm/__init__.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import traceback
3 | from concurrent.futures import ThreadPoolExecutor, as_completed
4 | from typing import Annotated, TypedDict, List, Sequence
5 |
6 | from pydantic import BaseModel
7 | from tqdm import tqdm
8 | from PIL import Image
9 |
10 | from marker.output import json_to_html
11 | from marker.processors import BaseProcessor
12 | from marker.schema import BlockTypes
13 | from marker.schema.blocks import Block, BlockId
14 | from marker.schema.document import Document
15 | from marker.schema.groups import PageGroup
16 | from marker.services import BaseService
17 | from marker.util import assign_config
18 | from marker.logger import get_logger
19 |
20 | logger = get_logger()
21 |
22 |
class PromptData(TypedDict):
    """Everything needed for a single LLM inference call on one block."""

    prompt: str  # Fully rendered prompt text sent to the LLM.
    image: Image.Image  # Cropped image of the block region.
    block: Block  # The block being rewritten.
    schema: BaseModel  # Pydantic schema the LLM response must conform to.
    page: PageGroup  # Page that contains ``block``.
    additional_data: dict | None  # Optional processor-specific extras.
30 |
31 |
class BlockData(TypedDict):
    """A (page, block) pair queued for LLM processing."""

    page: PageGroup  # Page containing the block.
    block: Block  # Block to be processed.
36 |
class BaseLLMProcessor(BaseProcessor):
    """
    A processor for using LLMs to convert blocks.

    Subclasses choose which blocks to rewrite (via ``block_types``) and how;
    this base provides image cropping, block JSON normalization, and helpers
    for applying LLM responses back onto the document.
    """

    max_concurrency: Annotated[
        int,
        "The maximum number of concurrent requests to make to the Gemini model.",
    ] = 3
    image_expansion_ratio: Annotated[
        float,
        "The ratio to expand the image by when cropping.",
    ] = 0.01
    use_llm: Annotated[
        bool,
        "Whether to use the LLM model.",
    ] = False
    disable_tqdm: Annotated[
        bool,
        "Whether to disable the tqdm progress bar.",
    ] = False
    # Block types this processor operates on; subclasses override.
    block_types = None

    def __init__(self, llm_service: BaseService, config=None):
        super().__init__(config)

        # Only keep the service when LLM usage is enabled; otherwise the
        # processor is a no-op and holds no reference to it.
        self.llm_service = None
        if not self.use_llm:
            return

        self.llm_service = llm_service

    def extract_image(
        self,
        document: Document,
        image_block: Block,
        remove_blocks: Sequence[BlockTypes] | None = None,
    ) -> Image.Image:
        """Crop a high-res image of ``image_block``, expanded slightly on each side."""
        return image_block.get_image(
            document,
            highres=True,
            expansion=(self.image_expansion_ratio, self.image_expansion_ratio),
            remove_blocks=remove_blocks,
        )

    def normalize_block_json(self, block: Block, document: Document, page: PageGroup):
        """
        Get the normalized JSON representation of a block for the LLM.
        """
        page_width = page.polygon.width
        page_height = page.polygon.height
        block_bbox = block.polygon.bbox

        # Normalize bbox to 0-1000 range
        normalized_bbox = [
            (block_bbox[0] / page_width) * 1000,
            (block_bbox[1] / page_height) * 1000,
            (block_bbox[2] / page_width) * 1000,
            (block_bbox[3] / page_height) * 1000,
        ]

        block_json = {
            "id": str(block.id),
            "block_type": str(block.id.block_type),
            "bbox": normalized_bbox,
            "html": json_to_html(block.render(document)),
        }

        return block_json

    def load_blocks(self, response: dict):
        """Parse each JSON-encoded block string in the LLM response."""
        return [json.loads(block) for block in response["blocks"]]

    def handle_rewrites(self, blocks: list, document: Document):
        """Apply rewritten HTML from LLM block data back onto the document.

        Malformed IDs or missing blocks are logged and skipped rather than
        aborting the whole batch.
        """
        for block_data in blocks:
            try:
                block_id = block_data["id"].strip().lstrip("/")
                _, page_id, block_type, block_id = block_id.split("/")
                block_id = BlockId(
                    page_id=page_id,
                    block_id=block_id,
                    block_type=getattr(BlockTypes, block_type),
                )
                block = document.get_block(block_id)
                if not block:
                    logger.debug(f"Block {block_id} not found in document")
                    continue

                if hasattr(block, "html"):
                    block.html = block_data["html"]
            except Exception as e:
                # Use .get() here: if the original failure was a missing "id"
                # key, indexing it again would raise out of this handler.
                logger.debug(f"Error parsing block ID {block_data.get('id')}: {e}")
                continue
130 |
131 |
class BaseLLMComplexBlockProcessor(BaseLLMProcessor):
    """
    A processor for using LLMs to convert blocks with more complex logic.
    """

    def __call__(self, document: Document):
        if not self.use_llm or self.llm_service is None:
            return

        try:
            self.rewrite_blocks(document)
        except Exception as e:
            logger.warning(f"Error rewriting blocks in {self.__class__.__name__}: {e}")

    def process_rewriting(self, document: Document, page: PageGroup, block: Block):
        """Rewrite one block in place. Subclasses must implement this."""
        raise NotImplementedError()

    def rewrite_blocks(self, document: Document):
        """Run ``process_rewriting`` concurrently over all matching blocks."""
        # Don't show progress if there are no blocks to process
        total_blocks = sum(
            len(page.contained_blocks(document, self.block_types))
            for page in document.pages
        )
        if total_blocks == 0:
            return

        pbar = tqdm(
            total=total_blocks,
            desc=f"{self.__class__.__name__} running",
            disable=self.disable_tqdm,
        )
        try:
            with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
                futures = [
                    executor.submit(self.process_rewriting, document, page, block)
                    for page in document.pages
                    for block in page.contained_blocks(document, self.block_types)
                ]
                for future in as_completed(futures):
                    future.result()  # Raise exceptions if any occurred
                    pbar.update(1)
        finally:
            # Close the bar even when a worker raises, so the terminal isn't
            # left with a dangling progress line.
            pbar.close()
175 |
176 |
class BaseLLMSimpleBlockProcessor(BaseLLMProcessor):
    """
    A processor for using LLMs to convert single blocks.
    """

    # Override init since we don't need an llmservice here
    def __init__(self, config=None):
        assign_config(self, config)

    def __call__(self, result: dict, prompt_data: PromptData, document: Document):
        try:
            self.rewrite_block(result, prompt_data, document)
        except Exception as e:
            logger.warning(f"Error rewriting block in {self.__class__.__name__}: {e}")
            traceback.print_exc()

    def inference_blocks(self, document: Document) -> List[BlockData]:
        """Collect every (page, block) pair this processor should run on."""
        return [
            {"page": page, "block": block}
            for page in document.pages
            for block in page.contained_blocks(document, self.block_types)
        ]

    def block_prompts(self, document: Document) -> List[PromptData]:
        """Build the prompts for all target blocks. Subclasses implement this."""
        raise NotImplementedError()

    def rewrite_block(
        self, response: dict, prompt_data: PromptData, document: Document
    ):
        """Apply one LLM response to its block. Subclasses implement this."""
        raise NotImplementedError()
207 |
```
--------------------------------------------------------------------------------
/benchmarks/overall/scorers/llm.py:
--------------------------------------------------------------------------------
```python
1 | import json
2 | import os
3 | import tempfile
4 | import time
5 | from typing import List
6 |
7 | from PIL import Image
8 | from google.genai.errors import APIError
9 | from google import genai
10 | import pypdfium2 as pdfium
11 |
12 | from benchmarks.overall.scorers import BaseScorer, BlockScores
13 | from marker.settings import settings
14 |
# Raw string is required: the prompt embeds LaTeX such as \frac, and in a
# normal string literal "\f" is interpreted as a form-feed character.
rating_prompt = r"""
You're a document analysis expert who is comparing some markdown to an image to make sure the markdown is correct. You're rating how effectively the provided markdown represents the full text and formatting in the image provided.
You're given an image, along with the extracted markdown:
- Some parts of the page may have been recognized as images and linked from the markdown, like ``.
- Tables will be formatted as Github flavored markdown.
- Block equations will be in LaTeX.
- The image and markdown may be in any language.
- The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text.

The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided.

**Instructions**
Follow this process to evaluate the markdown:
1. Carefully examine the image.
2. Carefully examine the markdown input provided.
3. Compare the image to the markdown representation. Does the markdown representation properly represent the important text and formatting in the image?
4. Assign component scores, as described below.

These are the primary scores:
- Overall - the overall quality of the markdown as compared to the image.
- Text quality - the quality of the text extraction from the image.
- Formatting quality - the quality of the formatting applied to the markdown, as compared to the image.

Depending on which elements are present in the markdown, you will assign element-specific scores.
- Tables - how effectively the tables have been extracted and formatted.
- Forms - how effectively the forms have extracted and formatted.
- Equations - how effectively block equations have been converted to LaTeX.
- Section headers - if all of the section headers have been detected, and the right levels set.
- Lists - if the lists have been properly extracted and formatted.
- Images - if images are identified and placed correctly.

Notes on scoring:
- To get a 5/5, all of the important text from the image must appear in the markdown, and the formatting should be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
- A 3/5 may have small missing text elements from the markdown and/or moderate formatting issues.
- A 1/5 will have major missing text segments from the markdown or completely unreadable formatting.
- Use 0/5 if a field isn't applicable, like if the image doesn't contain a table.

If text that is important to the meaning of the document is missing, do not score higher than 3/5.

Output json, like in the example below.

**Example**
Input
```markdown
# Section 1
This is some *markdown* extracted from a document. Here is a block equation:
$$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$
```
Output
```json
{
  "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
  "markdown_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
  "comparison": "The text and formatting matches the image. There are no formatting or text extraction issues. The equations and section headers are correct.",
  "overall": 5,
  "text": 5,
  "formatting": 5,
  "section_headers": 5,
  "tables": 0,
  "forms": 0,
  "equations": 5,
  "lists": 0,
  "images": 0
}
```
**Input**
```markdown
{{markdown}}
```
**Output**
"""

# Keys expected in every LLM rating response, split by value type.
comparison_keys = ["comparison"]
description_keys = ["image_description", "markdown_description"]
text_keys = comparison_keys + description_keys
score_keys = ["overall", "text", "formatting", "section_headers", "tables", "forms", "equations",
              "lists", "images"]
92 |
93 |
class LLMScorer(BaseScorer):
    """Score markdown quality by asking Gemini to compare it to the page image."""

    def __call__(self, sample, gt_markdown: List[str], markdown: str) -> BlockScores:
        """Render the first page of the sample PDF and rate ``markdown`` against it."""
        pdf_bytes = sample["pdf"]
        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            f.write(pdf_bytes)
            f.flush()
            f.seek(0)
            doc = pdfium.PdfDocument(f.name)
            img = doc[0].render(scale=96 / 72).to_pil()
            doc.close()

        return self.llm_rater(img, markdown)

    def llm_rater(self, img: Image.Image, markdown: str) -> BlockScores:
        """Ask the LLM for component scores; empty markdown gets a flat 1."""
        if not markdown:
            null_scores = {k: 1 for k in score_keys}
            text_scores = {k: "" for k in text_keys}
            null_scores.update(text_scores)
            return {
                "score": 1,
                "specific_scores": null_scores
            }
        req_keys = text_keys + score_keys
        properties = {}
        for key in req_keys:
            content_type = "INTEGER" if key in score_keys else "STRING"
            properties[key] = {"type": content_type}

        response_schema = {
            "required": req_keys,
            "properties": properties,
            "type": "OBJECT"
        }
        prompt = rating_prompt.replace("{{markdown}}", markdown)
        response = self.llm_response_wrapper([img, prompt], response_schema)
        assert all([k in response for k in req_keys]), f"Missing keys in response: {response}"
        return {
            "score": response["overall"],
            "specific_scores": response,
        }

    def llm_response_wrapper(self, prompt, response_schema, depth=0):
        """Call Gemini with a JSON response schema, retrying on APIError.

        Retries with a 120-second cooldown; after the retry budget is spent
        the last APIError is re-raised immediately (no pointless sleep).
        """
        client = genai.Client(
            http_options={"timeout": 60000},
            vertexai=True,
            project=os.getenv("VERTEX_PROJECT_ID"),
            location=os.getenv("VERTEX_LOCATION"),
        )
        try:
            responses = client.models.generate_content(
                model="gemini-2.0-flash-001",
                contents=prompt,
                config={
                    "temperature": 0,
                    "response_schema": response_schema,
                    "response_mime_type": "application/json",
                },
            )
            output = responses.candidates[0].content.parts[0].text
            return json.loads(output)
        except APIError as e:
            if depth > 2:
                # Retry budget exhausted -- fail now instead of sleeping first.
                raise e
            print("Hit Gemini rate limit, waiting 120 seconds")
            time.sleep(120)
            return self.llm_response_wrapper(prompt, response_schema, depth + 1)
```
--------------------------------------------------------------------------------
/tests/processors/test_llm_processors.py:
--------------------------------------------------------------------------------
```python
1 | from unittest.mock import MagicMock, Mock
2 |
3 | import pytest
4 | from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
5 | from marker.processors.llm.llm_equation import LLMEquationProcessor
6 |
7 | from marker.processors.llm.llm_form import LLMFormProcessor
8 | from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor
9 | from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
10 | from marker.processors.llm.llm_table import LLMTableProcessor
11 | from marker.processors.table import TableProcessor
12 | from marker.renderers.markdown import MarkdownRenderer
13 | from marker.schema import BlockTypes
14 | from marker.schema.blocks import ComplexRegion
15 |
16 |
@pytest.mark.filename("form_1040.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_form_processor_no_config(pdf_document, llm_service):
    # Without use_llm enabled, the meta processor must leave forms untouched.
    meta_processor = LLMSimpleBlockMetaProcessor([LLMFormProcessor()], llm_service)
    meta_processor(pdf_document)

    forms = pdf_document.contained_blocks((BlockTypes.Form,))
    assert forms[0].html is None
26 |
27 |
@pytest.mark.filename("form_1040.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_form_processor_no_cells(pdf_document, llm_service):
    # With use_llm on but no table cells detected, the form is left as-is.
    config = {"use_llm": True, "gemini_api_key": "test"}
    meta_processor = LLMSimpleBlockMetaProcessor(
        [LLMFormProcessor(config)], llm_service, config
    )
    meta_processor(pdf_document)

    forms = pdf_document.contained_blocks((BlockTypes.Form,))
    assert forms[0].html is None
38 |
39 |
@pytest.mark.filename("form_1040.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_form_processor(pdf_document, table_rec_model, recognition_model, detection_model):
    # Canned LLM output: a long corrected-markdown paragraph.
    corrected_html = "<p>" + ("<em>This is corrected markdown.</em>\n" * 100).strip() + "</p>\n"

    mock_service = Mock()
    mock_service.return_value = {"corrected_html": corrected_html}

    # Detect table cells first so the form processor has content to rewrite.
    cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model)
    cell_processor(pdf_document)

    config = {"use_llm": True, "gemini_api_key": "test"}
    meta_processor = LLMSimpleBlockMetaProcessor(
        [LLMFormProcessor(config)], mock_service, config
    )
    meta_processor(pdf_document)

    forms = pdf_document.contained_blocks((BlockTypes.Form,))
    assert forms[0].html == corrected_html.strip()
59 |
60 |
61 |
@pytest.mark.filename("table_ex2.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_table_processor(pdf_document, table_rec_model, recognition_model, detection_model):
    # Canned LLM output the mocked service returns for the table block.
    corrected_html = """
<table>
<tr>
<td>Column 1</td>
<td>Column 2</td>
<td>Column 3</td>
<td>Column 4</td>
</tr>
<tr>
<td>Value 1 <math>x</math></td>
<td>Value 2</td>
<td>Value 3</td>
<td>Value 4</td>
</tr>
<tr>
<td>Value 5</td>
<td>Value 6</td>
<td>Value 7</td>
<td>Value 8</td>
</tr>
</table>
""".strip()

    mock_service = Mock()
    mock_service.return_value = {"corrected_html": corrected_html}

    # Run cell detection first, then the LLM table rewrite.
    cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model)
    cell_processor(pdf_document)

    llm_processor = LLMTableProcessor(mock_service, {"use_llm": True, "gemini_api_key": "test"})
    llm_processor(pdf_document)

    tables = pdf_document.contained_blocks((BlockTypes.Table,))
    table_cells = tables[0].contained_blocks(pdf_document, (BlockTypes.TableCell,))
    assert table_cells[0].text == "Column 1"

    # Inline <math> cells should render as $...$ in the markdown output.
    markdown = MarkdownRenderer()(pdf_document).markdown
    assert "Value 1 $x$" in markdown
103 |
104 |
@pytest.mark.filename("A17_FlightPlan.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_caption_processor_disabled(pdf_document):
    # With image extraction left at its default, no descriptions are generated.
    config = {"use_llm": True, "gemini_api_key": "test"}
    meta_processor = LLMSimpleBlockMetaProcessor(
        [LLMImageDescriptionProcessor(config)], MagicMock(), config
    )
    meta_processor(pdf_document)

    pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure))
    assert all(picture.description is None for picture in pictures)
116 |
@pytest.mark.filename("A17_FlightPlan.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_caption_processor(pdf_document):
    description = "This is an image description."
    mock_service = Mock()
    mock_service.return_value = {"image_description": description}

    # extract_images=False makes the processor describe pictures instead.
    config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False}
    meta_processor = LLMSimpleBlockMetaProcessor(
        [LLMImageDescriptionProcessor(config)], mock_service, config
    )
    meta_processor(pdf_document)

    pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure))
    assert all(picture.description == description for picture in pictures)

    # Ensure the rendering includes the description
    md = MarkdownRenderer({"extract_images": False})(pdf_document).markdown
    assert description in md
137 |
138 |
@pytest.mark.filename("A17_FlightPlan.pdf")
@pytest.mark.config({"page_range": [0]})
def test_llm_complex_region_processor(pdf_document):
    md = "This is some *markdown* for a complex region."
    mock_service = Mock()
    mock_service.return_value = {"corrected_markdown": md * 25}

    # Replace the block with a complex region
    old_block = pdf_document.pages[0].children[0]
    new_block = ComplexRegion(
        **old_block.dict(exclude=["id", "block_id", "block_type"]),
    )
    pdf_document.pages[0].replace_block(old_block, new_block)

    # Run the complex-region processor with the mocked service.
    config = {"use_llm": True, "gemini_api_key": "test"}
    meta_processor = LLMSimpleBlockMetaProcessor(
        [LLMComplexRegionProcessor(config)], mock_service, config
    )
    meta_processor(pdf_document)

    # The corrected markdown should survive into the rendered output.
    rendered_md = MarkdownRenderer()(pdf_document).markdown
    assert md in rendered_md
164 |
@pytest.mark.filename("adversarial.pdf")
@pytest.mark.config({"page_range": [0]})
def test_multi_llm_processors(pdf_document):
    """Two simple-block processors sharing one meta processor both take effect."""
    description = "<math>This is an image description. And here is a lot of writing about it.</math>" * 10
    mock_cls = Mock()
    mock_cls.return_value = {"image_description": description, "corrected_equation": description}

    config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001}
    processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)]
    processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
    processor(pdf_document)

    contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure))
    assert all(picture.description == description for picture in contained_pictures)

    contained_equations = pdf_document.contained_blocks((BlockTypes.Equation,))
    # Removed leftover debug print of equation HTML -- it only cluttered test output.
    assert all(equation.html == description for equation in contained_equations)
```
--------------------------------------------------------------------------------
/benchmarks/table/inference.py:
--------------------------------------------------------------------------------
```python
1 | from typing import List
2 |
3 | import numpy as np
4 | from bs4 import BeautifulSoup
5 | import pypdfium2 as pdfium
6 | from tqdm import tqdm
7 | import base64
8 | import tempfile
9 |
10 | from benchmarks.table.gemini import gemini_table_rec
11 | from marker.config.parser import ConfigParser
12 | from marker.converters.table import TableConverter
13 | from marker.models import create_model_dict
14 | from marker.processors.llm.llm_table import LLMTableProcessor
15 | from marker.processors.table import TableProcessor
16 | from marker.renderers.json import JSONBlockOutput
17 | from marker.schema.polygon import PolygonBox
18 | from marker.util import matrix_intersection_area
19 |
20 |
def extract_tables(children: List[JSONBlockOutput]):
    """Depth-first collection of all 'Table' blocks in a block tree.

    Table nodes themselves are not descended into; non-table nodes are
    searched recursively when they have children.
    """
    found = []
    for child in children:
        if child.block_type == 'Table':
            found.append(child)
        elif child.children:
            found.extend(extract_tables(child.children))
    return found
29 |
def fix_table_html(table_html: str) -> str:
    """Normalize table HTML so marker output compares fairly against Fintabnet.

    Unwraps <tbody>, downgrades <th> cells to <td>, strips <br> tags, and
    replaces newlines with spaces (Fintabnet uses spaces instead of newlines).
    """
    soup = BeautifulSoup(table_html, 'html.parser')

    body = soup.find('tbody')
    if body:
        body.unwrap()
    for header_cell in soup.find_all('th'):
        header_cell.name = 'td'
    for line_break in soup.find_all('br'):
        line_break.replace_with(soup.new_string(''))

    return str(soup).replace("\n", " ")
43 |
44 |
def inference_tables(dataset, use_llm: bool, table_rec_batch_size: int | None, max_rows: int, use_gemini: bool):
    """Run marker's table pipeline over a benchmark dataset and align to GT.

    For each dataset row (a base64 PDF plus ground-truth tables), converts the
    document with TableConverter, renders the first page, matches detected
    tables to ground-truth tables by bbox intersection area, optionally asks
    Gemini for its own reconstruction of each aligned table image, and
    collects normalized HTML for scoring.

    Args:
        dataset: Indexable rows with base64 "pdf" and "tables" entries
            (GT tables carry "html" and "normalized_bbox").
        use_llm: Enable marker's LLM table processor.
        table_rec_batch_size: Table recognition batch size (None = default).
        max_rows: Cap on how many dataset rows to process (None = all).
        use_gemini: Also collect a Gemini table reconstruction per table.

    Returns:
        Tuple ``(results, total_unaligned)``: result dicts with
        "marker_table", "gt_table", "gemini_table" HTML, plus the count of GT
        tables that could not be matched to a marker table.
    """
    models = create_model_dict()
    config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True})
    total_unaligned = 0
    results = []

    iterations = len(dataset)
    if max_rows is not None:
        iterations = min(max_rows, len(dataset))

    for i in tqdm(range(iterations), desc='Converting Tables'):
        try:
            row = dataset[i]
            pdf_binary = base64.b64decode(row['pdf'])
            gt_tables = row['tables']  # Already sorted by reading order, which is what marker returns

            # Only use the basic table processors
            converter = TableConverter(
                config=config_parser.generate_config_dict(),
                artifact_dict=models,
                processor_list=[
                    "marker.processors.table.TableProcessor",
                    "marker.processors.llm.llm_table.LLMTableProcessor",
                ],
                renderer=config_parser.get_renderer()
            )

            with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file:
                temp_pdf_file.write(pdf_binary)
                temp_pdf_file.seek(0)
                marker_json = converter(temp_pdf_file.name).children

                # Render the first page so aligned tables can be cropped out.
                doc = pdfium.PdfDocument(temp_pdf_file.name)
                page_image = doc[0].render(scale=96/72).to_pil()
                doc.close()

            if len(marker_json) == 0 or len(gt_tables) == 0:
                print(f'No tables detected, skipping...')
                total_unaligned += len(gt_tables)
                continue

            marker_tables = extract_tables(marker_json)
            marker_table_boxes = [table.bbox for table in marker_tables]
            page_bbox = marker_json[0].bbox

            if len(marker_tables) != len(gt_tables):
                print(f'Number of tables do not match, skipping...')
                total_unaligned += len(gt_tables)
                continue

            # Crop each detected table from the page image (page coords are
            # rescaled to image pixel coords first).
            table_images = [
                page_image.crop(
                    PolygonBox.from_bbox(bbox)
                    .rescale(
                        (page_bbox[2], page_bbox[3]), (page_image.width, page_image.height)
                    ).bbox
                )
                for bbox
                in marker_table_boxes
            ]

            # Normalize the bboxes
            for bbox in marker_table_boxes:
                bbox[0] = bbox[0] / page_bbox[2]
                bbox[1] = bbox[1] / page_bbox[3]
                bbox[2] = bbox[2] / page_bbox[2]
                bbox[3] = bbox[3] / page_bbox[3]

            gt_boxes = [table['normalized_bbox'] for table in gt_tables]
            gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes]
            marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes]
            table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes)

            # Greedy 1:1 matching: each GT table takes the marker table with
            # the largest intersection, subject to area-ratio sanity checks.
            aligned_tables = []
            used_tables = set()
            unaligned_tables = set()
            for table_idx, alignment in enumerate(table_alignments):
                try:
                    max_area = np.max(alignment)
                    aligned_idx = np.argmax(alignment)
                except ValueError:
                    # No alignment found
                    unaligned_tables.add(table_idx)
                    continue

                if max_area <= .01:
                    # No alignment found
                    unaligned_tables.add(table_idx)
                    continue

                if aligned_idx in used_tables:
                    # Marker table already aligned with another gt table
                    unaligned_tables.add(table_idx)
                    continue

                # Gt table doesn't align well with any marker table
                gt_table_pct = gt_areas[table_idx] / max_area
                if not .85 < gt_table_pct < 1.15:
                    unaligned_tables.add(table_idx)
                    continue

                # Marker table doesn't align with gt table
                marker_table_pct = marker_areas[aligned_idx] / max_area
                if not .85 < marker_table_pct < 1.15:
                    unaligned_tables.add(table_idx)
                    continue

                gemini_html = ""
                if use_gemini:
                    try:
                        gemini_html = gemini_table_rec(table_images[aligned_idx])
                    except Exception as e:
                        print(f'Gemini failed: {e}')

                aligned_tables.append(
                    (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html)
                )
                used_tables.add(aligned_idx)

            total_unaligned += len(unaligned_tables)

            for marker_table, gt_table, gemini_table in aligned_tables:
                gt_table_html = gt_table['html']

                # marker wraps the table in <tbody> which fintabnet data doesn't
                # Fintabnet doesn't use th tags, need to be replaced for fair comparison
                marker_table_html = fix_table_html(marker_table.html)
                gemini_table_html = fix_table_html(gemini_table)

                results.append({
                    "marker_table": marker_table_html,
                    "gt_table": gt_table_html,
                    "gemini_table": gemini_table_html
                })
        except pdfium.PdfiumError:
            print('Broken PDF, Skipping...')
            continue
    return results, total_unaligned
```