This is page 3 of 9. Use http://codebase.md/datalab-to/marker?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── ISSUE_TEMPLATE │ │ ├── breaking-bug-report.md │ │ ├── feature_request.md │ │ └── output-bug-report.md │ └── workflows │ ├── benchmarks.yml │ ├── ci.yml │ ├── cla.yml │ ├── publish.yml │ └── scripts.yml ├── .gitignore ├── .pre-commit-config.yaml ├── benchmarks │ ├── __init__.py │ ├── overall │ │ ├── __init__.py │ │ ├── display │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ └── table.py │ │ ├── download │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── llamaparse.py │ │ │ ├── main.py │ │ │ ├── mathpix.py │ │ │ └── mistral.py │ │ ├── elo.py │ │ ├── methods │ │ │ ├── __init__.py │ │ │ ├── docling.py │ │ │ ├── gt.py │ │ │ ├── llamaparse.py │ │ │ ├── marker.py │ │ │ ├── mathpix.py │ │ │ ├── mistral.py │ │ │ ├── olmocr.py │ │ │ └── schema.py │ │ ├── overall.py │ │ ├── registry.py │ │ ├── schema.py │ │ └── scorers │ │ ├── __init__.py │ │ ├── clean.py │ │ ├── heuristic.py │ │ ├── llm.py │ │ └── schema.py │ ├── table │ │ ├── __init__.py │ │ ├── gemini.py │ │ ├── inference.py │ │ ├── scoring.py │ │ └── table.py │ ├── throughput │ │ ├── __init__.py │ │ └── main.py │ └── verify_scores.py ├── chunk_convert.py ├── CLA.md ├── convert_single.py ├── convert.py ├── data │ ├── .gitignore │ ├── examples │ │ ├── json │ │ │ ├── multicolcnn.json │ │ │ ├── switch_trans.json │ │ │ └── thinkpython.json │ │ └── markdown │ │ ├── multicolcnn │ │ │ ├── _page_1_Figure_0.jpeg │ │ │ ├── _page_2_Picture_0.jpeg │ │ │ ├── _page_6_Figure_0.jpeg │ │ │ ├── _page_7_Figure_0.jpeg │ │ │ ├── multicolcnn_meta.json │ │ │ └── multicolcnn.md │ │ ├── switch_transformers │ │ │ ├── _page_11_Figure_4.jpeg │ │ │ ├── _page_12_Figure_4.jpeg │ │ │ ├── _page_13_Figure_2.jpeg │ │ │ ├── _page_18_Figure_1.jpeg │ │ │ ├── _page_18_Figure_3.jpeg │ │ │ ├── _page_2_Figure_3.jpeg │ │ │ ├── _page_20_Figure_1.jpeg │ │ │ ├── _page_20_Figure_4.jpeg │ │ │ ├── _page_27_Figure_1.jpeg │ 
│ │ ├── _page_29_Figure_1.jpeg │ │ │ ├── _page_30_Figure_1.jpeg │ │ │ ├── _page_31_Figure_3.jpeg │ │ │ ├── _page_4_Figure_1.jpeg │ │ │ ├── _page_5_Figure_3.jpeg │ │ │ ├── switch_trans_meta.json │ │ │ └── switch_trans.md │ │ └── thinkpython │ │ ├── _page_109_Figure_1.jpeg │ │ ├── _page_115_Figure_1.jpeg │ │ ├── _page_116_Figure_3.jpeg │ │ ├── _page_127_Figure_1.jpeg │ │ ├── _page_128_Figure_1.jpeg │ │ ├── _page_167_Figure_1.jpeg │ │ ├── _page_169_Figure_1.jpeg │ │ ├── _page_173_Figure_1.jpeg │ │ ├── _page_190_Figure_1.jpeg │ │ ├── _page_195_Figure_1.jpeg │ │ ├── _page_205_Figure_1.jpeg │ │ ├── _page_23_Figure_1.jpeg │ │ ├── _page_23_Figure_3.jpeg │ │ ├── _page_230_Figure_1.jpeg │ │ ├── _page_233_Figure_1.jpeg │ │ ├── _page_233_Figure_3.jpeg │ │ ├── _page_234_Figure_1.jpeg │ │ ├── _page_235_Figure_1.jpeg │ │ ├── _page_236_Figure_1.jpeg │ │ ├── _page_236_Figure_3.jpeg │ │ ├── _page_237_Figure_1.jpeg │ │ ├── _page_238_Figure_1.jpeg │ │ ├── _page_46_Figure_1.jpeg │ │ ├── _page_60_Figure_1.jpeg │ │ ├── _page_60_Figure_3.jpeg │ │ ├── _page_67_Figure_1.jpeg │ │ ├── _page_71_Figure_1.jpeg │ │ ├── _page_78_Figure_1.jpeg │ │ ├── _page_85_Figure_1.jpeg │ │ ├── _page_94_Figure_1.jpeg │ │ ├── _page_99_Figure_17.jpeg │ │ ├── _page_99_Figure_178.jpeg │ │ ├── thinkpython_meta.json │ │ └── thinkpython.md │ ├── images │ │ ├── overall.png │ │ ├── per_doc.png │ │ └── table.png │ └── latex_to_md.sh ├── examples │ ├── marker_modal_deployment.py │ └── README.md ├── extraction_app.py ├── LICENSE ├── marker │ ├── builders │ │ ├── __init__.py │ │ ├── document.py │ │ ├── layout.py │ │ ├── line.py │ │ ├── ocr.py │ │ └── structure.py │ ├── config │ │ ├── __init__.py │ │ ├── crawler.py │ │ ├── parser.py │ │ └── printer.py │ ├── converters │ │ ├── __init__.py │ │ ├── extraction.py │ │ ├── ocr.py │ │ ├── pdf.py │ │ └── table.py │ ├── extractors │ │ ├── __init__.py │ │ ├── document.py │ │ └── page.py │ ├── logger.py │ ├── models.py │ ├── output.py │ ├── processors │ │ ├── __init__.py │ │ ├── 
blank_page.py │ │ ├── block_relabel.py │ │ ├── blockquote.py │ │ ├── code.py │ │ ├── debug.py │ │ ├── document_toc.py │ │ ├── equation.py │ │ ├── footnote.py │ │ ├── ignoretext.py │ │ ├── line_merge.py │ │ ├── line_numbers.py │ │ ├── list.py │ │ ├── llm │ │ │ ├── __init__.py │ │ │ ├── llm_complex.py │ │ │ ├── llm_equation.py │ │ │ ├── llm_form.py │ │ │ ├── llm_handwriting.py │ │ │ ├── llm_image_description.py │ │ │ ├── llm_mathblock.py │ │ │ ├── llm_meta.py │ │ │ ├── llm_page_correction.py │ │ │ ├── llm_sectionheader.py │ │ │ ├── llm_table_merge.py │ │ │ └── llm_table.py │ │ ├── order.py │ │ ├── page_header.py │ │ ├── reference.py │ │ ├── sectionheader.py │ │ ├── table.py │ │ ├── text.py │ │ └── util.py │ ├── providers │ │ ├── __init__.py │ │ ├── document.py │ │ ├── epub.py │ │ ├── html.py │ │ ├── image.py │ │ ├── pdf.py │ │ ├── powerpoint.py │ │ ├── registry.py │ │ ├── spreadsheet.py │ │ └── utils.py │ ├── renderers │ │ ├── __init__.py │ │ ├── chunk.py │ │ ├── extraction.py │ │ ├── html.py │ │ ├── json.py │ │ ├── markdown.py │ │ └── ocr_json.py │ ├── schema │ │ ├── __init__.py │ │ ├── blocks │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── basetable.py │ │ │ ├── caption.py │ │ │ ├── code.py │ │ │ ├── complexregion.py │ │ │ ├── equation.py │ │ │ ├── figure.py │ │ │ ├── footnote.py │ │ │ ├── form.py │ │ │ ├── handwriting.py │ │ │ ├── inlinemath.py │ │ │ ├── listitem.py │ │ │ ├── pagefooter.py │ │ │ ├── pageheader.py │ │ │ ├── picture.py │ │ │ ├── reference.py │ │ │ ├── sectionheader.py │ │ │ ├── table.py │ │ │ ├── tablecell.py │ │ │ ├── text.py │ │ │ └── toc.py │ │ ├── document.py │ │ ├── groups │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── figure.py │ │ │ ├── list.py │ │ │ ├── page.py │ │ │ ├── picture.py │ │ │ └── table.py │ │ ├── polygon.py │ │ ├── registry.py │ │ └── text │ │ ├── __init__.py │ │ ├── char.py │ │ ├── line.py │ │ └── span.py │ ├── scripts │ │ ├── __init__.py │ │ ├── chunk_convert.py │ │ ├── chunk_convert.sh │ │ ├── common.py │ │ ├── 
convert_single.py │ │ ├── convert.py │ │ ├── extraction_app.py │ │ ├── file_to_s3.py │ │ ├── run_streamlit_app.py │ │ ├── server.py │ │ └── streamlit_app.py │ ├── services │ │ ├── __init__.py │ │ ├── azure_openai.py │ │ ├── claude.py │ │ ├── gemini.py │ │ ├── ollama.py │ │ ├── openai.py │ │ └── vertex.py │ ├── settings.py │ ├── util.py │ └── utils │ ├── __init__.py │ ├── batch.py │ ├── gpu.py │ └── image.py ├── marker_app.py ├── marker_server.py ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── README.md ├── signatures │ └── version1 │ └── cla.json ├── static │ └── fonts │ └── .gitignore └── tests ├── builders │ ├── test_blank_page.py │ ├── test_document_builder.py │ ├── test_garbled_pdf.py │ ├── test_layout_replace.py │ ├── test_ocr_builder.py │ ├── test_ocr_pipeline.py │ ├── test_overriding.py │ ├── test_pdf_links.py │ ├── test_rotated_bboxes.py │ ├── test_strip_existing_ocr.py │ └── test_structure.py ├── config │ └── test_config.py ├── conftest.py ├── converters │ ├── test_extraction_converter.py │ ├── test_ocr_converter.py │ ├── test_pdf_converter.py │ └── test_table_converter.py ├── processors │ ├── test_document_toc_processor.py │ ├── test_equation_processor.py │ ├── test_footnote_processor.py │ ├── test_ignoretext.py │ ├── test_llm_processors.py │ ├── test_table_merge.py │ └── test_table_processor.py ├── providers │ ├── test_document_providers.py │ ├── test_image_provider.py │ └── test_pdf_provider.py ├── renderers │ ├── test_chunk_renderer.py │ ├── test_extract_images.py │ ├── test_html_renderer.py │ ├── test_json_renderer.py │ └── test_markdown_renderer.py ├── schema │ └── groups │ └── test_list_grouping.py ├── services │ └── test_service_init.py └── utils.py ``` # Files -------------------------------------------------------------------------------- /marker/services/claude.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import time 3 | from typing import List, Annotated, T 4 | 5 | 
import PIL 6 | from PIL import Image 7 | import anthropic 8 | from anthropic import RateLimitError, APITimeoutError 9 | from marker.logger import get_logger 10 | from pydantic import BaseModel 11 | 12 | from marker.schema.blocks import Block 13 | from marker.services import BaseService 14 | 15 | logger = get_logger() 16 | 17 | 18 | class ClaudeService(BaseService): 19 | claude_model_name: Annotated[ 20 | str, "The name of the Google model to use for the service." 21 | ] = "claude-3-7-sonnet-20250219" 22 | claude_api_key: Annotated[str, "The Claude API key to use for the service."] = None 23 | max_claude_tokens: Annotated[ 24 | int, "The maximum number of tokens to use for a single Claude request." 25 | ] = 8192 26 | 27 | def process_images(self, images: List[Image.Image]) -> List[dict]: 28 | return [ 29 | { 30 | "type": "image", 31 | "source": { 32 | "type": "base64", 33 | "media_type": "image/webp", 34 | "data": self.img_to_base64(img), 35 | }, 36 | } 37 | for img in images 38 | ] 39 | 40 | def validate_response(self, response_text: str, schema: type[T]) -> T: 41 | response_text = response_text.strip() 42 | if response_text.startswith("```json"): 43 | response_text = response_text[7:] 44 | if response_text.endswith("```"): 45 | response_text = response_text[:-3] 46 | 47 | try: 48 | # Try to parse as JSON first 49 | out_schema = schema.model_validate_json(response_text) 50 | out_json = out_schema.model_dump() 51 | return out_json 52 | except Exception: 53 | try: 54 | # Re-parse with fixed escapes 55 | escaped_str = response_text.replace("\\", "\\\\") 56 | out_schema = schema.model_validate_json(escaped_str) 57 | return out_schema.model_dump() 58 | except Exception: 59 | return 60 | 61 | def get_client(self): 62 | return anthropic.Anthropic( 63 | api_key=self.claude_api_key, 64 | ) 65 | 66 | def __call__( 67 | self, 68 | prompt: str, 69 | image: PIL.Image.Image | List[PIL.Image.Image] | None, 70 | block: Block | None, 71 | response_schema: type[BaseModel], 72 | 
max_retries: int | None = None, 73 | timeout: int | None = None, 74 | ): 75 | if max_retries is None: 76 | max_retries = self.max_retries 77 | 78 | if timeout is None: 79 | timeout = self.timeout 80 | 81 | schema_example = response_schema.model_json_schema() 82 | system_prompt = f""" 83 | Follow the instructions given by the user prompt. You must provide your response in JSON format matching this schema: 84 | 85 | {json.dumps(schema_example, indent=2)} 86 | 87 | Respond only with the JSON schema, nothing else. Do not include ```json, ```, or any other formatting. 88 | """.strip() 89 | 90 | client = self.get_client() 91 | image_data = self.format_image_for_llm(image) 92 | 93 | messages = [ 94 | { 95 | "role": "user", 96 | "content": [ 97 | *image_data, 98 | {"type": "text", "text": prompt}, 99 | ], 100 | } 101 | ] 102 | 103 | total_tries = max_retries + 1 104 | for tries in range(1, total_tries + 1): 105 | try: 106 | response = client.messages.create( 107 | system=system_prompt, 108 | model=self.claude_model_name, 109 | max_tokens=self.max_claude_tokens, 110 | messages=messages, 111 | timeout=timeout, 112 | ) 113 | # Extract and validate response 114 | response_text = response.content[0].text 115 | return self.validate_response(response_text, response_schema) 116 | except (RateLimitError, APITimeoutError) as e: 117 | # Rate limit exceeded 118 | if tries == total_tries: 119 | # Last attempt failed. Give up 120 | logger.error( 121 | f"Rate limit error: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})", 122 | ) 123 | break 124 | else: 125 | wait_time = tries * self.retry_wait_time 126 | logger.warning( 127 | f"Rate limit error: {e}. Retrying in {wait_time} seconds... 
(Attempt {tries}/{total_tries})", 128 | ) 129 | time.sleep(wait_time) 130 | except Exception as e: 131 | logger.error(f"Error during Claude API call: {e}") 132 | break 133 | 134 | return {} 135 | ``` -------------------------------------------------------------------------------- /CLA.md: -------------------------------------------------------------------------------- ```markdown 1 | Marker Contributor Agreement 2 | 3 | This Marker Contributor Agreement ("MCA") applies to any contribution that you make to any product or project managed by us (the "project"), and sets out the intellectual property rights you grant to us in the contributed materials. The term "us" shall mean Endless Labs, Inc. The term "you" shall mean the person or entity identified below. 4 | 5 | If you agree to be bound by these terms, sign by writing "I have read the CLA document and I hereby sign the CLA" in response to the CLA bot Github comment. Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement. 6 | 7 | 1. The term 'contribution' or 'contributed materials' means any source code, object code, patch, tool, sample, graphic, specification, manual, documentation, or any other material posted or submitted by you to the project. 8 | 2. With respect to any worldwide copyrights, or copyright applications and registrations, in your contribution: 9 | - you hereby assign to us joint ownership, and to the extent that such assignment is or becomes invalid, ineffective or unenforceable, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty free, unrestricted license to exercise all rights under those copyrights. 
This includes, at our option, the right to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements, including dual-license structures for commercial customers; 10 | - you agree that each of us can do all things in relation to your contribution as if each of us were the sole owners, and if one of us makes a derivative work of your contribution, the one who makes the derivative work (or has it made) will be the sole owner of that derivative work; 11 | - you agree that you will not assert any moral rights in your contribution against us, our licensees or transferees; 12 | - you agree that we may register a copyright in your contribution and exercise all ownership rights associated with it; and 13 | - you agree that neither of us has any duty to consult with, obtain the consent of, pay or render an accounting to the other for any use or distribution of your contribution. 14 | 3. With respect to any patents you own, or that you can license without payment to any third party, you hereby grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, royalty-free license to: 15 | - make, have made, use, sell, offer to sell, import, and otherwise transfer your contribution in whole or in part, alone or in combination with or included in any product, work or materials arising out of the project to which your contribution was submitted, and 16 | - at our option, to sublicense these same rights to third parties through multiple levels of sublicensees or other licensing arrangements. 17 | If you or your affiliates institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the contribution or any project it was submitted to constitutes direct or contributory patent infringement, then any patent licenses granted to you under this agreement for that contribution shall terminate as of the date such litigation is filed. 18 | 4. 
Except as set out above, you keep all right, title, and interest in your contribution. The rights that you grant to us under these terms are effective on the date you first submitted a contribution to us, even if your submission took place before the date you sign these terms. Any contribution we make available under any license will also be made available under a suitable FSF (Free Software Foundation) or OSI (Open Source Initiative) approved license. 19 | 5. You covenant, represent, warrant and agree that: 20 | - each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this MCA; 21 | - to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and 22 | - each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws. 23 | You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. Endless Labs, Inc. may publicly disclose your participation in the project, including the fact that you have signed the MCA. 24 | 6. This MCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply. 
``` -------------------------------------------------------------------------------- /marker/schema/text/line.py: -------------------------------------------------------------------------------- ```python 1 | import html 2 | import re 3 | from typing import Literal, List 4 | 5 | import regex 6 | 7 | from marker.schema import BlockTypes 8 | from marker.schema.blocks import Block, BlockOutput 9 | 10 | HYPHENS = r"-—¬" 11 | 12 | 13 | def remove_tags(text): 14 | return re.sub(r"<[^>]+>", "", text) 15 | 16 | 17 | def replace_last(string, old, new): 18 | matches = list(re.finditer(old, string)) 19 | if not matches: 20 | return string 21 | last_match = matches[-1] 22 | return string[: last_match.start()] + new + string[last_match.end() :] 23 | 24 | 25 | def strip_trailing_hyphens(line_text, next_line_text, line_html) -> str: 26 | lowercase_letters = r"\p{Ll}" 27 | 28 | hyphen_regex = regex.compile(rf".*[{HYPHENS}]\s?$", regex.DOTALL) 29 | next_line_starts_lowercase = regex.match( 30 | rf"^\s?[{lowercase_letters}]", next_line_text 31 | ) 32 | 33 | if hyphen_regex.match(line_text) and next_line_starts_lowercase: 34 | line_html = replace_last(line_html, rf"[{HYPHENS}]", "") 35 | 36 | return line_html 37 | 38 | 39 | class Line(Block): 40 | block_type: BlockTypes = BlockTypes.Line 41 | block_description: str = "A line of text." 
42 | formats: List[Literal["math"]] | None = ( 43 | None # Sometimes we want to set math format at the line level, not span 44 | ) 45 | 46 | def ocr_input_text(self, document): 47 | text = "" 48 | for block in self.contained_blocks(document, (BlockTypes.Span,)): 49 | # We don't include superscripts/subscripts and math since they can be unreliable at this stage 50 | block_text = block.text 51 | if block.italic: 52 | text += f"<i>{block_text}</i>" 53 | elif block.bold: 54 | text += f"<b>{block_text}</b>" 55 | else: 56 | text += block_text 57 | 58 | return text.strip() 59 | 60 | def formatted_text(self, document, skip_urls=False): 61 | text = "" 62 | for block in self.contained_blocks(document, (BlockTypes.Span,)): 63 | block_text = html.escape(block.text) 64 | 65 | if block.has_superscript: 66 | block_text = re.sub(r"^([0-9\W]+)(.*)", r"<sup>\1</sup>\2", block_text) 67 | if "<sup>" not in block_text: 68 | block_text = f"<sup>{block_text}</sup>" 69 | 70 | if block.url and not skip_urls: 71 | block_text = f"<a href='{block.url}'>{block_text}</a>" 72 | 73 | if block.italic: 74 | text += f"<i>{block_text}</i>" 75 | elif block.bold: 76 | text += f"<b>{block_text}</b>" 77 | elif block.math: 78 | text += f"<math display='inline'>{block_text}</math>" 79 | else: 80 | text += block_text 81 | 82 | return text 83 | 84 | def assemble_html(self, document, child_blocks, parent_structure, block_config): 85 | template = "" 86 | for c in child_blocks: 87 | template += c.html 88 | 89 | raw_text = remove_tags(template).strip() 90 | structure_idx = parent_structure.index(self.id) 91 | if structure_idx < len(parent_structure) - 1: 92 | next_block_id = parent_structure[structure_idx + 1] 93 | next_line = document.get_block(next_block_id) 94 | next_line_raw_text = next_line.raw_text(document) 95 | template = strip_trailing_hyphens(raw_text, next_line_raw_text, template) 96 | else: 97 | template = template.strip( 98 | " " 99 | ) # strip any trailing whitespace from the last line 100 | return 
template 101 | 102 | def render( 103 | self, document, parent_structure, section_hierarchy=None, block_config=None 104 | ): 105 | child_content = [] 106 | if self.structure is not None and len(self.structure) > 0: 107 | for block_id in self.structure: 108 | block = document.get_block(block_id) 109 | child_content.append( 110 | block.render( 111 | document, parent_structure, section_hierarchy, block_config 112 | ) 113 | ) 114 | 115 | return BlockOutput( 116 | html=self.assemble_html( 117 | document, child_content, parent_structure, block_config 118 | ), 119 | polygon=self.polygon, 120 | id=self.id, 121 | children=[], 122 | section_hierarchy=section_hierarchy, 123 | ) 124 | 125 | def merge(self, other: "Line"): 126 | self.polygon = self.polygon.merge([other.polygon]) 127 | 128 | # Handle merging structure with Nones 129 | if self.structure is None: 130 | self.structure = other.structure 131 | elif other.structure is not None: 132 | self.structure = self.structure + other.structure 133 | 134 | # Merge formats with Nones 135 | if self.formats is None: 136 | self.formats = other.formats 137 | elif other.formats is not None: 138 | self.formats = list(set(self.formats + other.formats)) 139 | ``` -------------------------------------------------------------------------------- /marker/processors/list.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Annotated, List, Tuple 2 | 3 | from marker.processors import BaseProcessor 4 | from marker.schema import BlockTypes 5 | from marker.schema.blocks import ListItem 6 | from marker.schema.document import Document 7 | 8 | 9 | class ListProcessor(BaseProcessor): 10 | """ 11 | A processor for merging lists across pages and columns 12 | """ 13 | block_types = (BlockTypes.ListGroup,) 14 | ignored_block_types: Annotated[ 15 | Tuple[BlockTypes], 16 | "The list of block types to ignore when merging lists.", 17 | ] = (BlockTypes.PageHeader, BlockTypes.PageFooter) 18 | 
min_x_indent: Annotated[ 19 | float, "The minimum horizontal indentation required to consider a block as a nested list item.", 20 | "This is expressed as a percentage of the page width and is used to determine hierarchical relationships within a list.", 21 | ] = 0.01 22 | 23 | def __init__(self, config): 24 | super().__init__(config) 25 | 26 | def __call__(self, document: Document): 27 | self.list_group_continuation(document) 28 | self.list_group_indentation(document) 29 | 30 | def list_group_continuation(self, document: Document): 31 | for page in document.pages: 32 | for block in page.contained_blocks(document, self.block_types): 33 | next_block = document.get_next_block(block, self.ignored_block_types) 34 | if next_block is None: 35 | continue 36 | if next_block.block_type not in self.block_types: 37 | continue 38 | if next_block.structure is None: 39 | continue 40 | if next_block.ignore_for_output: 41 | continue 42 | 43 | column_break, page_break = False, False 44 | next_block_in_first_quadrant = False 45 | 46 | if next_block.page_id == block.page_id: # block on the same page 47 | # we check for a column break 48 | column_break = next_block.polygon.y_start <= block.polygon.y_end 49 | else: 50 | page_break = True 51 | next_page = document.get_page(next_block.page_id) 52 | next_block_in_first_quadrant = (next_block.polygon.x_start < next_page.polygon.width // 2) and \ 53 | (next_block.polygon.y_start < next_page.polygon.height // 2) 54 | 55 | block.has_continuation = column_break or (page_break and next_block_in_first_quadrant) 56 | 57 | def list_group_indentation(self, document: Document): 58 | for page in document.pages: 59 | for block in page.contained_blocks(document, self.block_types): 60 | if block.structure is None: 61 | continue 62 | if block.ignore_for_output: 63 | continue 64 | 65 | stack: List[ListItem] = [block.get_next_block(page, None)] 66 | for list_item_id in block.structure: 67 | list_item_block: ListItem = page.get_block(list_item_id) 68 | 69 | 
# This can be a line sometimes 70 | if list_item_block.block_type != BlockTypes.ListItem: 71 | continue 72 | 73 | while stack and list_item_block.polygon.x_start <= stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width): 74 | stack.pop() 75 | 76 | if stack and list_item_block.polygon.y_start > stack[-1].polygon.y_start: 77 | list_item_block.list_indent_level = stack[-1].list_indent_level 78 | if list_item_block.polygon.x_start > stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width): 79 | list_item_block.list_indent_level += 1 80 | 81 | next_list_item_block = block.get_next_block(page, list_item_block) 82 | if next_list_item_block is not None and next_list_item_block.polygon.x_start > list_item_block.polygon.x_end: 83 | stack = [next_list_item_block] # reset stack on column breaks 84 | else: 85 | stack.append(list_item_block) 86 | 87 | stack: List[ListItem] = [block.get_next_block(page, None)] 88 | for list_item_id in block.structure.copy(): 89 | list_item_block: ListItem = page.get_block(list_item_id) 90 | 91 | while stack and list_item_block.list_indent_level <= stack[-1].list_indent_level: 92 | stack.pop() 93 | 94 | if stack: 95 | current_parent = stack[-1] 96 | current_parent.add_structure(list_item_block) 97 | current_parent.polygon = current_parent.polygon.merge([list_item_block.polygon]) 98 | 99 | block.remove_structure_items([list_item_id]) 100 | stack.append(list_item_block) 101 | ``` -------------------------------------------------------------------------------- /marker/config/crawler.py: -------------------------------------------------------------------------------- ```python 1 | import importlib 2 | import inspect 3 | import pkgutil 4 | from functools import cached_property 5 | from typing import Annotated, Dict, Set, Type, get_args, get_origin 6 | 7 | from marker.builders import BaseBuilder 8 | from marker.converters import BaseConverter 9 | from marker.extractors import BaseExtractor 10 | from marker.processors import 
BaseProcessor 11 | from marker.providers import BaseProvider 12 | from marker.renderers import BaseRenderer 13 | from marker.services import BaseService 14 | 15 | 16 | class ConfigCrawler: 17 | def __init__( 18 | self, 19 | base_classes=( 20 | BaseBuilder, 21 | BaseProcessor, 22 | BaseConverter, 23 | BaseProvider, 24 | BaseRenderer, 25 | BaseService, 26 | BaseExtractor, 27 | ), 28 | ): 29 | self.base_classes = base_classes 30 | self.class_config_map: Dict[str, dict] = {} 31 | 32 | self._crawl_config() 33 | 34 | def _crawl_config(self): 35 | for base in self.base_classes: 36 | base_class_type = base.__name__.removeprefix("Base") 37 | self.class_config_map.setdefault(base_class_type, {}) 38 | for class_name, class_type in self._find_subclasses(base).items(): 39 | if class_name.startswith("Base"): 40 | continue 41 | 42 | self.class_config_map[base_class_type].setdefault( 43 | class_name, {"class_type": class_type, "config": {}} 44 | ) 45 | for attr, attr_type in self._gather_super_annotations( 46 | class_type 47 | ).items(): 48 | default = getattr(class_type, attr) 49 | metadata = (f"Default is {default}.",) 50 | 51 | if get_origin(attr_type) is Annotated: 52 | if any("Default" in desc for desc in attr_type.__metadata__): 53 | metadata = attr_type.__metadata__ 54 | else: 55 | metadata = attr_type.__metadata__ + metadata 56 | attr_type = get_args(attr_type)[0] 57 | 58 | formatted_type = self._format_type(attr_type) 59 | self.class_config_map[base_class_type][class_name]["config"][ 60 | attr 61 | ] = (attr_type, formatted_type, default, metadata) 62 | 63 | @staticmethod 64 | def _gather_super_annotations(cls: Type) -> Dict[str, Type]: 65 | """ 66 | Collect all annotated attributes from `cls` and its superclasses, bottom-up. 67 | Subclass attributes overwrite superclass attributes with the same name. 68 | """ 69 | # We'll walk the MRO from base -> derived so subclass attributes overwrite 70 | # the same attribute name from superclasses. 
71 | annotations = {} 72 | for base in reversed(cls.__mro__): 73 | if base is object: 74 | continue 75 | if hasattr(base, "__annotations__"): 76 | for name, annotation in base.__annotations__.items(): 77 | annotations[name] = annotation 78 | return annotations 79 | 80 | @cached_property 81 | def attr_counts(self) -> Dict[str, int]: 82 | counts: Dict[str, int] = {} 83 | for base_type_dict in self.class_config_map.values(): 84 | for class_map in base_type_dict.values(): 85 | for attr in class_map["config"].keys(): 86 | counts[attr] = counts.get(attr, 0) + 1 87 | return counts 88 | 89 | @cached_property 90 | def attr_set(self) -> Set[str]: 91 | attr_set: Set[str] = set() 92 | for base_type_dict in self.class_config_map.values(): 93 | for class_name, class_map in base_type_dict.items(): 94 | for attr in class_map["config"].keys(): 95 | attr_set.add(attr) 96 | attr_set.add(f"{class_name}_{attr}") 97 | return attr_set 98 | 99 | def _find_subclasses(self, base_class): 100 | subclasses = {} 101 | module_name = base_class.__module__ 102 | package = importlib.import_module(module_name) 103 | if hasattr(package, "__path__"): 104 | for _, module_name, _ in pkgutil.walk_packages( 105 | package.__path__, module_name + "." 
from typing import Annotated, List

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.schema.document import Document
from marker.schema.text import Line
from marker.util import matrix_intersection_area


class LineMergeProcessor(BaseProcessor):
    """
    A processor for merging inline math lines.

    Fragmented provider lines inside a text-like block are merged left-to-right
    when their horizontally expanded boxes overlap and they sit on the same
    visual row. Merged lines are tagged "math", since this kind of
    fragmentation is typically caused by inline equations. The processor is a
    no-op unless ``use_llm`` is enabled (see ``__call__``).
    """
    block_types = (BlockTypes.Text, BlockTypes.TextInlineMath, BlockTypes.Caption, BlockTypes.Footnote, BlockTypes.SectionHeader)
    min_merge_pct: Annotated[
        float,
        "The minimum percentage of intersection area to consider merging."
    ] = .015
    block_expand_threshold: Annotated[
        float,
        "The percentage of the block width to expand the bounding box."
    ] = .05
    min_merge_ydist: Annotated[
        float,
        "The minimum y distance between lines to consider merging."
    ] = 5  # NOTE(review): declared but never referenced in this processor — confirm whether it should gate merging
    intersection_pct_threshold: Annotated[
        float,
        "The total amount of intersection area concentrated in the max intersection block."
    ] = .5
    vertical_overlap_pct_threshold: Annotated[
        float,
        "The minimum percentage of vertical overlap to consider merging."
    ] = .8
    use_llm: Annotated[
        bool,
        "Whether to use LLMs to improve accuracy."
    ] = False

    def __init__(self, config):
        super().__init__(config)

    def merge_lines(self, lines: List[Line], block: Block):
        """Merge horizontally adjacent lines of ``block`` in place.

        Builds a pairwise intersection-area matrix of the horizontally expanded
        line boxes, accumulates runs of lines that overlap the current run,
        then folds each multi-line run into its first line and tags the result
        as math. Consumed lines are removed from ``block.structure`` and marked
        ``removed``.
        """
        lines = [l for l in lines if l.polygon.width * 5 > l.polygon.height]  # Skip vertical lines
        line_bboxes = [l.polygon.expand(self.block_expand_threshold, 0).bbox for l in lines]  # Expand horizontally
        intersections = matrix_intersection_area(line_bboxes, line_bboxes)

        merges = []
        merge = []
        for i in range(len(line_bboxes)):
            intersection_row = intersections[i]
            intersection_row[i] = 0  # Zero out the current idx

            if i < len(line_bboxes) - 1:
                intersection_row[i+1] = 0  # Zero out the next idx, so we only evaluate merge from the left

            if len(merge) == 0:
                merge.append(i)
                continue

            # Intersection of this line with the current merge run only
            merge_intersection = sum([intersection_row[m] for m in merge])
            line_area = lines[i].polygon.area
            intersection_pct = merge_intersection / max(1, line_area)

            # max(1, ...) guards against division by zero for degenerate boxes
            total_intersection = max(1, sum(intersection_row))

            # Vertical extent of the first line of the run is used as the row reference
            line_start = lines[merge[0]].polygon.y_start
            line_end = lines[merge[0]].polygon.y_end

            vertical_overlap_start = max(line_start, lines[i].polygon.y_start)
            vertical_overlap_end = min(line_end, lines[i].polygon.y_end)
            vertical_overlap = max(0, vertical_overlap_end - vertical_overlap_start)
            vertical_overlap_pct = vertical_overlap / max(1, lines[i].polygon.height)

            if all([
                # Overlaps enough
                intersection_pct >= self.min_merge_pct,
                # Within same line
                vertical_overlap_pct > self.vertical_overlap_pct_threshold,
                # doesn't overlap with anything else
                merge_intersection / total_intersection > self.intersection_pct_threshold
            ]):
                merge.append(i)
            else:
                merges.append(merge)
                merge = []

        if merge:
            merges.append(merge)

        # Only runs of 2+ lines are real merges
        merges = [m for m in merges if len(m) > 1]
        merged = set()
        for merge in merges:
            merge = [m for m in merge if m not in merged]  # A line can only be consumed once
            if len(merge) < 2:
                continue

            line: Line = lines[merge[0]]
            merged.add(merge[0])
            for idx in merge[1:]:
                other_line: Line = lines[idx]
                line.merge(other_line)
                block.structure.remove(other_line.id)
                other_line.removed = True  # Mark line as removed
                merged.add(idx)

            # It is probably math if we are merging provider lines like this
            if not line.formats:
                line.formats = ["math"]
            elif "math" not in line.formats:
                line.formats.append("math")


    def __call__(self, document: Document):
        """Merge candidate lines on every page of ``document`` (LLM mode only)."""
        # Merging lines only needed for inline math
        if not self.use_llm:
            return

        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                if block.structure is None:
                    continue

                if not len(block.structure) >= 2:  # Skip single lines
                    continue

                lines = block.contained_blocks(document, (BlockTypes.Line,))
                self.merge_lines(lines, block)
from marker.models import create_model_dict
from marker.providers.registry import provider_from_filepath
from marker.renderers.chunk import ChunkRenderer
from marker.renderers.html import HTMLRenderer
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.renderers.markdown import MarkdownRenderer
from marker.renderers.json import JSONRenderer
from marker.schema.registry import register_block_class
from marker.util import classes_to_strings, strings_to_classes


@pytest.fixture(scope="session")
def model_dict():
    # Models are expensive to load: build the dict once per test session and
    # release it at teardown.
    model_dict = create_model_dict()
    yield model_dict
    del model_dict


@pytest.fixture(scope="session")
def layout_model(model_dict):
    # Convenience accessor for a single model out of the session dict.
    yield model_dict["layout_model"]


@pytest.fixture(scope="session")
def detection_model(model_dict):
    yield model_dict["detection_model"]


@pytest.fixture(scope="session")
def recognition_model(model_dict):
    yield model_dict["recognition_model"]


@pytest.fixture(scope="session")
def table_rec_model(model_dict):
    yield model_dict["table_rec_model"]


@pytest.fixture(scope="session")
def ocr_error_model(model_dict):
    yield model_dict["ocr_error_model"]


@pytest.fixture(scope="function")
def config(request):
    # Reads the per-test ``@pytest.mark.config({...})`` marker; an empty dict
    # when the marker is absent.
    config_mark = request.node.get_closest_marker("config")
    config = config_mark.args[0] if config_mark else {}

    # Tests may override block classes via config["override_map"]; register
    # the overrides globally before the document is built.
    override_map: Dict[BlockTypes, Type[Block]] = config.get("override_map", {})
    for block_type, override_block_type in override_map.items():
        register_block_class(block_type, override_block_type)

    return config


@pytest.fixture(scope="session")
def pdf_dataset():
    # Shared HF dataset of test PDFs, downloaded once per session.
    return datasets.load_dataset("datalab-to/pdfs", split="train")


@pytest.fixture(scope="function")
def temp_doc(request, pdf_dataset):
    # Selects a document from the dataset by the ``@pytest.mark.filename``
    # marker (default: adversarial.pdf) and materializes it as a temp file.
    filename_mark = request.node.get_closest_marker("filename")
    filename = filename_mark.args[0] if filename_mark else "adversarial.pdf"

    idx = pdf_dataset["filename"].index(filename)
    suffix = filename.split(".")[-1]

    # Keep the NamedTemporaryFile handle alive for the duration of the test so
    # the underlying file is not deleted while providers read it.
    temp_pdf = tempfile.NamedTemporaryFile(suffix=f".{suffix}")
    temp_pdf.write(pdf_dataset["pdf"][idx])
    temp_pdf.flush()
    yield temp_pdf


@pytest.fixture(scope="function")
def doc_provider(request, config, temp_doc):
    # Provider class is chosen from the temp file's extension.
    provider_cls = provider_from_filepath(temp_doc.name)
    yield provider_cls(temp_doc.name, config)


@pytest.fixture(scope="function")
def pdf_document(
    request,
    config,
    doc_provider,
    layout_model,
    ocr_error_model,
    recognition_model,
    detection_model,
):
    # Runs the full builder pipeline (layout -> lines -> OCR -> structure) to
    # produce a ready-to-process Document for tests.
    layout_builder = LayoutBuilder(layout_model, config)
    line_builder = LineBuilder(detection_model, ocr_error_model, config)
    ocr_builder = OcrBuilder(recognition_model, config)
    builder = DocumentBuilder(config)
    structure_builder = StructureBuilder(config)
    document = builder(doc_provider, layout_builder, line_builder, ocr_builder)
    structure_builder(document)
    yield document


@pytest.fixture(scope="function")
def pdf_converter(request, config, model_dict, renderer, llm_service):
    # PdfConverter takes renderer / service as import strings, not classes.
    if llm_service:
        llm_service = classes_to_strings([llm_service])[0]
    yield PdfConverter(
        artifact_dict=model_dict,
        processor_list=None,
        renderer=classes_to_strings([renderer])[0],
        config=config,
        llm_service=llm_service,
    )
import json

from pydantic import BaseModel
from typing import Annotated, Optional, List

from marker.extractors import BaseExtractor
from marker.extractors.page import PageExtractionSchema
from marker.logger import get_logger

logger = get_logger()


class DocumentExtractionSchema(BaseModel):
    # LLM response contract: free-form reasoning plus the extracted JSON text.
    analysis: str
    document_json: str


class DocumentExtractor(BaseExtractor):
    """
    An extractor that combines data from across all pages.

    Takes the per-page notes produced by the page extractor, feeds them to the
    LLM together with the user's JSON schema, and returns the merged
    document-level extraction.
    """

    page_schema: Annotated[
        str,
        "The JSON schema to be extracted from the page.",
    ] = ""

    page_extraction_prompt = """You are an expert document analyst who reads documents and pulls data out in JSON format. You will receive your detailed notes from all the pages of a document, and a JSON schema that we want to extract from the document. Your task is to extract all the information properly into the JSON schema.

Some notes:
- The schema may contain a single object to extract from the entire document, or an array of objects.
- The schema may contain nested objects, arrays, and other complex structures.

Some guidelines:
- Some entities will span multiple pages, so make sure to consult your notes thoroughly.
- In the case of potential conflicting values, pull out the values you have the most confidence in, from your notes.
- If you cannot find a value for a field, leave it blank in the JSON.

**Instructions:**
1. Analyze your provided notes.
2. Analyze the JSON schema.
3. Write a detailed analysis of the notes, and the associated values in the schema. Make sure to reference which page each piece of information comes from.
4. Write the output in the JSON schema format, ensuring all required fields are filled out. Output only the json data, without any additional text or formatting.

**Example:**
Input:

Detailed Notes
Page 0
On this page, I see a table with car makes and sales. The makes are Honda and Toyota, with sales of 100 and 200 respectively. The color is not present in the table, so I will leave it blank in the JSON. That information may be present on another page. Some JSON snippets I may find useful later are:
```json
{
    "make": "Honda",
    "sales": 100,
}
```
```json
{
    "make": "Toyota",
    "sales": 200,
}
```

Honda is the first row in the table, and Toyota is the second row. Make is the first column, and sales is the second.

Page 1
I see a table that contains 2 rows, and has a color header. The first row has the color red, and the second row has the color blue. Here are some useful snippets:

Schema

```json
{'$defs': {'Cars': {'properties': {'make': {'title': 'Make', 'type': 'string'}, 'sales': {'title': 'Sales', 'type': 'integer'}, 'color': {'title': 'Color', 'type': 'string'}}, 'required': ['make', 'sales', 'color'], 'title': 'Cars', 'type': 'object'}}, 'properties': {'cars': {'items': {'$ref': '#/$defs/Cars'}, 'title': 'Cars', 'type': 'array'}}, 'required': ['cars'], 'title': 'CarsList', 'type': 'object'}
```

Output:

Analysis: From the notes, it looks like the information I need is in a table that spans 2 pages. The first page has the makes and sales, while the second page has the colors. I will combine this information into the JSON schema.
JSON

{
    "cars": [
        {
            "make": "Honda",
            "sales": 100,
            "color": "red"
        },
        {
            "make": "Toyota",
            "sales": 200,
            "color": "blue"
        }
    ]
}

**Input:**

Detailed Notes
{{document_notes}}

Schema
```json
{{schema}}
```
"""

    def assemble_document_notes(self, page_notes: List[PageExtractionSchema]) -> str:
        """Concatenate per-page notes into a single "Page N" delimited string.

        Pages with no notes are skipped; page numbers are 1-based.
        """
        notes = ""
        for i, page_schema in enumerate(page_notes):
            # Bug fix: previously this checked the whole list (`page_notes`),
            # which is always truthy inside the loop; check the current item
            # instead so empty/missing page entries are actually skipped.
            if not page_schema:
                continue
            notes += f"Page {i + 1}\n{page_schema.detailed_notes}\n\n"
        return notes.strip()

    def __call__(
        self,
        page_notes: List[PageExtractionSchema],
        **kwargs,
    ) -> Optional[DocumentExtractionSchema]:
        """Run document-level extraction over the collected page notes.

        Returns None when the LLM response is missing or incomplete.
        Raises ValueError when ``page_schema`` was never configured.
        """
        if not self.page_schema:
            raise ValueError(
                "Page schema must be defined for structured extraction to work."
            )

        prompt = self.page_extraction_prompt.replace(
            "{{document_notes}}", self.assemble_document_notes(page_notes)
        ).replace("{{schema}}", json.dumps(self.page_schema))
        response = self.llm_service(prompt, None, None, DocumentExtractionSchema)

        logger.debug(f"Document extraction response: {response}")

        if not response or any(
            [
                key not in response
                for key in [
                    "analysis",
                    "document_json",
                ]
            ]
        ):
            return None

        # Strip optional markdown code fences around the JSON payload.
        # Bug fix: str.lstrip("```json")/rstrip("```") strip *character sets*
        # (backtick, j, s, o, n), which can eat leading JSON characters;
        # removeprefix/removesuffix strip exact substrings.
        json_data = response["document_json"].strip()
        json_data = (
            json_data.removeprefix("```json")
            .removeprefix("```")
            .removesuffix("```")
            .strip()
        )

        return DocumentExtractionSchema(
            analysis=response["analysis"], document_json=json_data
        )
class CommonParams(BaseModel):
    """Conversion options shared by the JSON and multipart endpoints."""

    filepath: Annotated[
        Optional[str], Field(description="The path to the PDF file to convert.")
    ]
    page_range: Annotated[
        Optional[str],
        Field(
            description="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20",
            example=None,
        ),
    ] = None
    force_ocr: Annotated[
        bool,
        Field(
            description="Force OCR on all pages of the PDF. Defaults to False. This can lead to worse results if you have good text in your PDFs (which is true in most cases)."
        ),
    ] = False
    paginate_output: Annotated[
        bool,
        Field(
            description="Whether to paginate the output. Defaults to False. If set to True, each page of the output will be separated by a horizontal rule that contains the page number (2 newlines, {PAGE_NUMBER}, 48 - characters, 2 newlines)."
        ),
    ] = False
    output_format: Annotated[
        str,
        Field(
            description="The format to output the text in. Can be 'markdown', 'json', or 'html'. Defaults to 'markdown'."
        ),
    ] = "markdown"


async def _convert_pdf(params: CommonParams):
    """Run the converter on ``params.filepath`` and package the result.

    Returns a dict with ``success`` always set; on failure the traceback is
    printed server-side and only the error string is returned to the caller.
    Images are base64-encoded in the configured output format.
    """
    assert params.output_format in ["markdown", "json", "html", "chunks"], (
        "Invalid output format"
    )
    try:
        options = params.model_dump()
        config_parser = ConfigParser(options)
        config_dict = config_parser.generate_config_dict()
        # Keep worker count at 1 inside the server process to avoid
        # oversubscribing when multiple requests run concurrently.
        config_dict["pdftext_workers"] = 1
        converter_cls = PdfConverter
        converter = converter_cls(
            config=config_dict,
            artifact_dict=app_data["models"],
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service(),
        )
        rendered = converter(params.filepath)
        text, _, images = text_from_rendered(rendered)
        metadata = rendered.metadata
    except Exception as e:
        traceback.print_exc()
        return {
            "success": False,
            "error": str(e),
        }

    encoded = {}
    for k, v in images.items():
        byte_stream = io.BytesIO()
        v.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
        encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(
            settings.OUTPUT_ENCODING
        )

    return {
        "format": params.output_format,
        "output": text,
        "images": encoded,
        "metadata": metadata,
        "success": True,
    }


@app.post("/marker")
async def convert_pdf(params: CommonParams):
    """Convert a PDF already present on the server filesystem."""
    return await _convert_pdf(params)


@app.post("/marker/upload")
async def convert_pdf_upload(
    page_range: Optional[str] = Form(default=None),
    force_ocr: Optional[bool] = Form(default=False),
    paginate_output: Optional[bool] = Form(default=False),
    output_format: Optional[str] = Form(default="markdown"),
    file: UploadFile = File(
        ..., description="The PDF file to convert.", media_type="application/pdf"
    ),
):
    """Accept a multipart upload, convert it, and clean up the temp file."""
    # Security fix: the client controls ``file.filename``; joining it directly
    # into UPLOAD_DIRECTORY allowed path traversal (e.g. "../../x.pdf").
    # Reduce it to its basename before building the path.
    safe_filename = os.path.basename(file.filename or "upload.pdf")
    upload_path = os.path.join(UPLOAD_DIRECTORY, safe_filename)
    with open(upload_path, "wb+") as upload_file:
        file_contents = await file.read()
        upload_file.write(file_contents)

    # Robustness fix: previously a conversion error leaked the uploaded file;
    # always remove it, even when _convert_pdf raises.
    try:
        params = CommonParams(
            filepath=upload_path,
            page_range=page_range,
            force_ocr=force_ocr,
            paginate_output=paginate_output,
            output_format=output_format,
        )
        results = await _convert_pdf(params)
    finally:
        os.remove(upload_path)
    return results


@click.command()
@click.option("--port", type=int, default=8000, help="Port to run the server on")
@click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
def server_cli(port: int, host: str):
    """CLI entry point: serve the FastAPI app with uvicorn."""
    import uvicorn

    # Run the server
    uvicorn.run(
        app,
        host=host,
        port=port,
    )
    """

    block_types: Annotated[
        Tuple[BlockTypes],
        "The block types to process.",
    ] = (BlockTypes.Equation,)
    model_max_length: Annotated[
        int,
        "The maximum number of tokens to allow for the Recognition model.",
    ] = 1024  # NOTE(review): not referenced below (max_tokens is hard-coded in get_latex_batched) — confirm
    equation_batch_size: Annotated[
        int,
        "The batch size to use for the recognition model while processing equations.",
        "Default is None, which will use the default batch size for the model.",
    ] = None
    disable_tqdm: Annotated[
        bool,
        "Whether to disable the tqdm progress bar.",
    ] = False
    drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False

    def __init__(self, recognition_model: RecognitionPredictor, config=None):
        super().__init__(config)

        self.recognition_model = recognition_model

    def get_batch_size(self) -> int:
        """Return the recognition batch size, preferring the configured value."""
        # Set to 1/4th of OCR batch size due to sequence length with tiling
        if self.equation_batch_size is not None:
            return self.equation_batch_size
        elif settings.TORCH_DEVICE_MODEL == "cuda":
            return 32
        elif settings.TORCH_DEVICE_MODEL == "mps":
            return 6
        return 6

    def __call__(self, document: Document):
        """OCR every Equation block in ``document`` and store the result as html.

        Collects all equation crops across all pages first, runs one batched
        recognition pass, then writes each prediction back onto its block.
        """
        images = []
        equation_boxes = []
        equation_block_ids = []
        total_equation_blocks = 0

        for page in document.pages:
            page_image = page.get_image(highres=True)
            page_size = page.polygon.width, page.polygon.height
            image_size = page_image.size

            page_equation_boxes = []
            page_equation_block_ids = []
            equation_blocks = page.contained_blocks(document, self.block_types)
            for block in equation_blocks:
                # Rescale from page (PDF) coordinates to image pixel coordinates
                page_equation_boxes.append(
                    block.polygon.rescale(page_size, image_size).bbox
                )
                page_equation_block_ids.append(block.id)
                total_equation_blocks += 1

            images.append(page_image)
            equation_boxes.append(page_equation_boxes)
            equation_block_ids.append(page_equation_block_ids)

        if total_equation_blocks == 0:
            return

        predictions = self.get_latex_batched(images, equation_boxes)
        for page_predictions, page_equation_block_ids in zip(
            predictions, equation_block_ids
        ):
            assert len(page_predictions) == len(page_equation_block_ids), (
                "Every equation block should have a corresponding prediction"
            )
            for block_prediction, block_id in zip(
                page_predictions, page_equation_block_ids
            ):
                block = document.get_block(block_id)
                block.html = self.fix_latex(block_prediction)

    def fix_latex(self, math_html: str) -> str:
        """Normalize model output: force block display and clean stray markup.

        Returns "" when the prediction contains no <math> tag at all.
        """
        math_html = math_html.strip()
        soup = BeautifulSoup(math_html, "html.parser")
        opening_math_tag = soup.find("math")

        # No math block found
        if not opening_math_tag:
            return ""

        # Force block format
        opening_math_tag.attrs["display"] = "block"
        fixed_math_html = str(soup)

        # Sometimes model outputs newlines at the beginning/end of tags
        fixed_math_html = re.sub(
            r"^<math display=\"block\">\\n(?![a-zA-Z])",
            '<math display="block">',
            fixed_math_html,
        )
        fixed_math_html = re.sub(r"\\n</math>$", "</math>", fixed_math_html)
        fixed_math_html = re.sub(r"<br>", "", fixed_math_html)
        # Repair mojibake and unescape HTML entities in the final markup
        fixed_math_html = fix_text(
            fixed_math_html, config=TextFixerConfig(unescape_html=True)
        )
        return fixed_math_html

    def get_latex_batched(
        self,
        page_images: List[Image.Image],
        bboxes: List[List[List[float]]],
    ):
        """Run the recognition model over every page's equation boxes at once.

        Returns one list of predicted strings per page, ordered like the
        input ``bboxes``.
        """
        self.recognition_model.disable_tqdm = self.disable_tqdm
        predictions: List[OCRResult] = self.recognition_model(
            images=page_images,
            bboxes=bboxes,
            task_names=["ocr_with_boxes"] * len(page_images),
            recognition_batch_size=self.get_batch_size(),
            sort_lines=False,
            drop_repeated_text=self.drop_repeated_text,
            max_tokens=2048,
            max_sliding_window=2148,
        )

        # One text string per equation box, per page
        equation_predictions = [
            [line.text.strip() for line in page_prediction.text_lines]
            for page_prediction in predictions
        ]

        return equation_predictions
- Keep the LaTeX code inside the math tags simple, concise, and KaTeX compatible.
- Enclose all equations in the correct math tags. Use multiple math tags inside the html to represent multiple equations.
- Only use the html tags math, i, b, p, and br.
- Make sure to include all the equations in the image in the html output.
- Make sure to include other text in the image in the correct positions along with the equations.

**Instructions:**
1. Carefully examine the provided image.
2. Analyze the existing html, which may include LaTeX code.
3. Write a short analysis of how the html should be corrected to represent the image.
4. If the html and LaTeX are correct, write "No corrections needed."
5. If the html and LaTeX are incorrect, generate the corrected html.
6. Output only the analysis, then the corrected html or "No corrections needed."
**Example:**
Input:
```html
The following equation illustrates the Pythagorean theorem:
x2 + y2 = z2

And this equation is a bit more complex:
(ab * x5 + x2 + 2 * x + 123)/t
```
Output:
analysis: The equations are not formatted as LaTeX, or enclosed in math tags.
```html
<p>The following equation illustrates the Pythagorean theorem:</p>
<math display="block">x^{2} + y^{2} = z^{2}</math>

<p>And this equation is a bit more complex, and contains <math>ab \cdot x^{5}</math>:</p>
<math display="block">\frac{ab \cdot x^{5} + x^{2} + 2 \cdot x + 123}{t}</math>
```
**Input:**
```html
{equation}
```
"""

    def inference_blocks(self, document: Document) -> List[BlockData]:
        """Filter candidate equation blocks down to the ones worth an LLM pass.

        Small equations are skipped unless ``redo_inline_math`` is enabled,
        in which case every equation block is reprocessed.
        """
        blocks = super().inference_blocks(document)
        out_blocks = []
        for block_data in blocks:
            block = block_data["block"]
            page = block_data["page"]

            # If we redo inline math, we redo all equations
            if all([
                block.polygon.height / page.polygon.height < self.min_equation_height,
                not self.redo_inline_math
            ]):
                continue
            out_blocks.append(block_data)
        return out_blocks

    def block_prompts(self, document: Document) -> List[PromptData]:
        """Build one prompt (text + cropped image) per selected equation block."""
        prompt_data = []
        for block_data in self.inference_blocks(document):
            block = block_data["block"]
            # Prefer previously-generated html; fall back to raw OCR text
            text = block.html if block.html else block.raw_text(document)
            prompt = self.equation_latex_prompt.replace("{equation}", text)
            image = self.extract_image(document, block)

            prompt_data.append({
                "prompt": prompt,
                "image": image,
                "block": block,
                "schema": EquationSchema,
                "page": block_data["page"]
            })

        return prompt_data


    def rewrite_block(self, response: dict, prompt_data: PromptData, document: Document):
        """Apply the LLM's corrected html to the block, with sanity checks.

        Malformed or suspiciously short responses increment the block's
        llm_error_count metadata instead of overwriting the html.
        """
        block = prompt_data["block"]
        text = block.html if block.html else block.raw_text(document)

        if not response or "corrected_equation" not in response:
            block.update_metadata(llm_error_count=1)
            return

        html_equation = response["corrected_equation"]

        if "no corrections needed" in html_equation.lower():
            return

        # Guard against truncated output: every <math> must be closed, and the
        # response must not be drastically shorter than the original text.
        balanced_tags = html_equation.count("<math") == html_equation.count("</math>")
        if not all([
            html_equation,
            balanced_tags,
            len(html_equation) > len(text) * .3,
        ]):
            block.update_metadata(llm_error_count=1)
            return

        block.html = html_equation

class EquationSchema(BaseModel):
    # LLM response contract for the equation-correction prompt.
    analysis: str
    corrected_equation: str
15 | """ 16 | gap_threshold: Annotated[ 17 | float, 18 | "The minimum gap between blocks to consider them part of the same group.", 19 | ] = 0.05 20 | list_gap_threshold: Annotated[ 21 | float, 22 | "The minimum gap between list items to consider them part of the same group.", 23 | ] = 0.1 24 | 25 | def __init__(self, config=None): 26 | super().__init__(config) 27 | 28 | def __call__(self, document: Document): 29 | for page in document.pages: 30 | self.group_caption_blocks(page) 31 | self.group_lists(page) 32 | self.unmark_lists(page) 33 | 34 | def group_caption_blocks(self, page: PageGroup): 35 | gap_threshold_px = self.gap_threshold * page.polygon.height 36 | static_page_structure = page.structure.copy() 37 | remove_ids = list() 38 | 39 | for i, block_id in enumerate(static_page_structure): 40 | block = page.get_block(block_id) 41 | if block.block_type not in [BlockTypes.Table, BlockTypes.Figure, BlockTypes.Picture]: 42 | continue 43 | 44 | if block.id in remove_ids: 45 | continue 46 | 47 | block_structure = [block_id] 48 | selected_polygons = [block.polygon] 49 | caption_types = [BlockTypes.Caption, BlockTypes.Footnote] 50 | 51 | prev_block = page.get_prev_block(block) 52 | next_block = page.get_next_block(block) 53 | 54 | if prev_block and \ 55 | prev_block.block_type in caption_types and \ 56 | prev_block.polygon.minimum_gap(block.polygon) < gap_threshold_px and \ 57 | prev_block.id not in remove_ids: 58 | block_structure.insert(0, prev_block.id) 59 | selected_polygons.append(prev_block.polygon) 60 | 61 | if next_block and \ 62 | next_block.block_type in caption_types and \ 63 | next_block.polygon.minimum_gap(block.polygon) < gap_threshold_px: 64 | block_structure.append(next_block.id) 65 | selected_polygons.append(next_block.polygon) 66 | 67 | if len(block_structure) > 1: 68 | # Create a merged block 69 | new_block_cls = get_block_class(BlockTypes[block.block_type.name + "Group"]) 70 | new_polygon = block.polygon.merge(selected_polygons) 71 | group_block = 
page.add_block(new_block_cls, new_polygon) 72 | group_block.structure = block_structure 73 | 74 | # Update the structure of the page to reflect the new block 75 | page.update_structure_item(block_id, group_block.id) 76 | remove_ids.extend(block_structure) 77 | page.remove_structure_items(remove_ids) 78 | 79 | def group_lists(self, page: PageGroup): 80 | gap_threshold_px = self.list_gap_threshold * page.polygon.height 81 | static_page_structure = page.structure.copy() 82 | remove_ids = list() 83 | for i, block_id in enumerate(static_page_structure): 84 | block = page.get_block(block_id) 85 | if block.block_type not in [BlockTypes.ListItem]: 86 | continue 87 | 88 | if block.id in remove_ids: 89 | continue 90 | 91 | block_structure = [block_id] 92 | selected_polygons = [block.polygon] 93 | 94 | for j, next_block_id in enumerate(page.structure[i + 1:]): 95 | next_block = page.get_block(next_block_id) 96 | if all([ 97 | next_block.block_type == BlockTypes.ListItem, 98 | next_block.polygon.minimum_gap(selected_polygons[-1]) < gap_threshold_px 99 | ]): 100 | block_structure.append(next_block_id) 101 | selected_polygons.append(next_block.polygon) 102 | else: 103 | break 104 | 105 | if len(block_structure) > 1: 106 | new_polygon = block.polygon.merge(selected_polygons) 107 | group_block = page.add_block(ListGroup, new_polygon) 108 | group_block.structure = block_structure 109 | 110 | # Update the structure of the page to reflect the new block 111 | page.update_structure_item(block_id, group_block.id) 112 | remove_ids.extend(block_structure) 113 | 114 | page.remove_structure_items(remove_ids) 115 | 116 | def unmark_lists(self, page: PageGroup): 117 | # If lists aren't grouped, unmark them as list items 118 | for block_id in page.structure: 119 | block = page.get_block(block_id) 120 | if block.block_type == BlockTypes.ListItem: 121 | generated_block = Text( 122 | polygon=block.polygon, 123 | page_id=block.page_id, 124 | structure=block.structure, 125 | ) 126 | 
import json
import time
import traceback
from io import BytesIO
from typing import List, Annotated, Optional

import PIL
from google import genai
from google.genai import types
from google.genai.errors import APIError
from marker.logger import get_logger
from pydantic import BaseModel

from marker.schema.blocks import Block
from marker.services import BaseService

logger = get_logger()


class BaseGeminiService(BaseService):
    """Shared Gemini LLM service: image encoding, request building, and retries."""

    gemini_model_name: Annotated[
        str, "The name of the Google model to use for the service."
    ] = "gemini-2.0-flash"
    # Optional because the default is None (no thinking budget configured).
    thinking_budget: Annotated[
        Optional[int], "The thinking token budget to use for the service."
    ] = None

    def img_to_bytes(self, img: PIL.Image.Image):
        """Encode a PIL image as WEBP bytes for the Gemini API."""
        image_bytes = BytesIO()
        img.save(image_bytes, format="WEBP")
        return image_bytes.getvalue()

    def get_google_client(self, timeout: int):
        """Subclasses must return a configured genai client."""
        raise NotImplementedError

    def process_images(self, images):
        """Convert PIL images into Gemini API Part objects."""
        image_parts = [
            types.Part.from_bytes(data=self.img_to_bytes(img), mime_type="image/webp")
            for img in images
        ]
        return image_parts

    def __call__(
        self,
        prompt: str,
        image: PIL.Image.Image | List[PIL.Image.Image] | None,
        block: Block | None,
        response_schema: type[BaseModel],
        max_retries: int | None = None,
        timeout: int | None = None,
    ):
        """
        Call Gemini with retry handling and return the parsed JSON response.

        Retries rate-limit style APIErrors with linear backoff and invalid-JSON
        responses with a slightly raised temperature. Returns {} on failure.
        """
        if max_retries is None:
            max_retries = self.max_retries

        if timeout is None:
            timeout = self.timeout

        client = self.get_google_client(timeout=timeout)
        image_parts = self.format_image_for_llm(image)

        total_tries = max_retries + 1
        temperature = 0
        for tries in range(1, total_tries + 1):
            config = {
                "temperature": temperature,
                "response_schema": response_schema,
                "response_mime_type": "application/json",
            }
            if self.max_output_tokens:
                config["max_output_tokens"] = self.max_output_tokens

            if self.thinking_budget is not None:
                # For gemini models, we can optionally set a thinking budget in the config
                config["thinking_config"] = types.ThinkingConfig(
                    thinking_budget=self.thinking_budget
                )

            try:
                responses = client.models.generate_content(
                    model=self.gemini_model_name,
                    contents=image_parts
                    + [
                        prompt
                    ],  # According to gemini docs, it performs better if the image is the first element
                    config=config,
                )
                output = responses.candidates[0].content.parts[0].text
                total_tokens = responses.usage_metadata.total_token_count
                if block:
                    block.update_metadata(
                        llm_tokens_used=total_tokens, llm_request_count=1
                    )
                return json.loads(output)
            except APIError as e:
                if e.code in [429, 443, 503]:
                    # Rate limit exceeded
                    if tries == total_tries:
                        # Last attempt failed. Give up
                        logger.error(
                            f"APIError: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
                        )
                        break
                    else:
                        wait_time = tries * self.retry_wait_time
                        logger.warning(
                            f"APIError: {e}. Retrying in {wait_time} seconds... (Attempt {tries}/{total_tries})",
                        )
                        time.sleep(wait_time)
                else:
                    logger.error(f"APIError: {e}")
                    break
            except json.JSONDecodeError as e:
                temperature = 0.2  # Increase temperature slightly to try and get a different response

                # The response was not valid JSON
                if tries == total_tries:
                    # Last attempt failed. Give up
                    logger.error(
                        f"JSONDecodeError: {e}. Max retries reached. Giving up. (Attempt {tries}/{total_tries})",
                    )
                    break
                else:
                    logger.warning(
                        f"JSONDecodeError: {e}. Retrying... (Attempt {tries}/{total_tries})",
                    )
            except Exception as e:
                logger.error(f"Exception: {e}")
                traceback.print_exc()
                break

        return {}


class GoogleGeminiService(BaseGeminiService):
    # Optional because the default is None (key may come from the environment/config).
    gemini_api_key: Annotated[
        Optional[str], "The Google API key to use for the service."
    ] = None

    def get_google_client(self, timeout: int):
        return genai.Client(
            api_key=self.gemini_api_key,
            http_options={"timeout": timeout * 1000},  # Convert to milliseconds
        )
from typing import Annotated

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document


class LineNumbersProcessor(BaseProcessor):
    """
    A processor for ignoring line numbers.
    """
    block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
    strip_numbers_threshold: Annotated[
        float,
        "The fraction of lines or tokens in a block that must be numeric to consider them as line numbers.",
    ] = 0.6
    min_lines_in_block: Annotated[
        int,
        "The minimum number of lines required in a block for it to be considered during processing.",
        "Ensures that small blocks are ignored as they are unlikely to contain meaningful line numbers.",
    ] = 4
    min_line_length: Annotated[
        int,
        "The minimum length of a line (in characters) to consider it significant when checking for",
        "numeric prefixes or suffixes. Prevents false positives for short lines.",
    ] = 10
    min_line_number_span_ratio: Annotated[
        float,
        "The minimum ratio of detected line number spans to total lines required to treat them as line numbers.",
    ] = 0.6

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document):
        self.ignore_line_number_spans(document)
        self.ignore_line_starts_ends(document)
        self.ignore_line_number_blocks(document)

    def ignore_line_number_spans(self, document: Document):
        # Hide the leftmost purely-numeric span of each line when enough lines on
        # the page follow that pattern.
        for page in document.pages:
            total_lines = 0
            numeric_spans = []
            for line in page.contained_blocks(document, (BlockTypes.Line,)):
                if line.structure is None:
                    continue

                total_lines += 1
                spans = line.contained_blocks(document, (BlockTypes.Span,))
                if not spans:
                    continue

                # min() keeps the first span on ties, matching a strict "<" scan.
                leftmost = min(spans, key=lambda s: s.polygon.x_start)
                if leftmost.text.strip().isnumeric():
                    numeric_spans.append(leftmost)

            if total_lines > 0 and len(numeric_spans) / total_lines > self.min_line_number_span_ratio:
                for span in numeric_spans:
                    span.ignore_for_output = True

    def ignore_line_number_blocks(self, document: Document):
        # Hide whole blocks that are mostly digits and taller than they are wide,
        # like a vertical strip of page/line numbers.
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                tokens = block.raw_text(document).strip().split()
                if len(tokens) < 4:
                    continue

                digit_fraction = sum(token.isdigit() for token in tokens) / len(tokens)
                is_tall = block.polygon.height > block.polygon.width
                if digit_fraction > self.strip_numbers_threshold and is_tall:
                    block.ignore_for_output = True

    def ignore_line_starts_ends(self, document: Document):
        # Hide numeric spans at the start or end of lines when most lines in a
        # block begin/end with a number.
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                if block.structure is None:
                    continue

                lines = block.structure_blocks(document)
                if len(lines) < self.min_lines_in_block:
                    continue

                starts_flags = []
                ends_flags = []
                for line in lines:
                    spans = line.structure_blocks(document)
                    if len(spans) < 2:
                        starts_flags.append(False)
                        ends_flags.append(False)
                        continue

                    line_text = line.raw_text(document)
                    first_text = spans[0].text.strip()
                    last_text = spans[-1].text.strip()

                    starts_flags.append(
                        first_text.isdigit()
                        and len(line_text) - len(first_text) > self.min_line_length
                    )
                    ends_flags.append(
                        last_text.isdigit()
                        and len(line_text) - len(last_text) > self.min_line_length
                    )

                if sum(starts_flags) / len(starts_flags) > self.strip_numbers_threshold:
                    for is_number, line in zip(starts_flags, lines):
                        if is_number:
                            page.get_block(line.structure[0]).ignore_for_output = True

                if sum(ends_flags) / len(ends_flags) > self.strip_numbers_threshold:
                    for is_number, line in zip(ends_flags, lines):
                        if is_number:
                            page.get_block(line.structure[-1]).ignore_for_output = True
import os

from marker.scripts.common import (
    load_models,
    parse_args,
    img_to_html,
    get_page_image,
    page_count,
)

os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["IN_STREAMLIT"] = "true"

from marker.settings import settings
from streamlit.runtime.uploaded_file_manager import UploadedFile

import re
import tempfile
from typing import Any, Dict, Tuple

import streamlit as st
from PIL import Image

from marker.converters.pdf import PdfConverter
from marker.config.parser import ConfigParser
from marker.output import text_from_rendered


def convert_pdf(fname: str, config_parser: ConfigParser) -> Tuple[str, Dict[str, Any], dict]:
    """Run the marker PDF converter on ``fname`` using the parsed config."""
    config_dict = config_parser.generate_config_dict()
    config_dict["pdftext_workers"] = 1
    converter_cls = PdfConverter
    converter = converter_cls(
        config=config_dict,
        artifact_dict=model_dict,
        processor_list=config_parser.get_processors(),
        renderer=config_parser.get_renderer(),
        llm_service=config_parser.get_llm_service(),
    )
    return converter(fname)


def markdown_insert_images(markdown, images):
    """Replace markdown image references with inline HTML for extracted images."""
    image_tags = re.findall(
        r'(!\[(?P<image_title>[^\]]*)\]\((?P<image_path>[^\)"\s]+)\s*([^\)]*)\))',
        markdown,
    )

    for image in image_tags:
        image_markdown = image[0]
        image_alt = image[1]
        image_path = image[2]
        if image_path in images:
            markdown = markdown.replace(
                image_markdown, img_to_html(images[image_path], image_alt)
            )
    return markdown


st.set_page_config(layout="wide")
col1, col2 = st.columns([0.5, 0.5])

model_dict = load_models()
cli_options = parse_args()

st.markdown("""
# Marker Demo

This app will let you try marker, a PDF or image -> Markdown, HTML, JSON converter. It works with any language, and extracts images, tables, equations, etc.

Find the project [here](https://github.com/VikParuchuri/marker).
""")

in_file: UploadedFile = st.sidebar.file_uploader(
    "PDF, document, or image file:",
    type=["pdf", "png", "jpg", "jpeg", "gif", "pptx", "docx", "xlsx", "html", "epub"],
)

if in_file is None:
    st.stop()

filetype = in_file.type

with col1:
    # Use a distinct local name: assigning the result back to `page_count` would
    # shadow (and destroy) the imported page_count() helper.
    total_pages = page_count(in_file)
    page_number = st.number_input(
        f"Page number out of {total_pages}:", min_value=0, value=0, max_value=total_pages
    )
    pil_image = get_page_image(in_file, page_number)
    st.image(pil_image, use_container_width=True)

page_range = st.sidebar.text_input(
    "Page range to parse, comma separated like 0,5-10,20",
    value=f"{page_number}-{page_number}",
)
output_format = st.sidebar.selectbox(
    "Output format", ["markdown", "json", "html", "chunks"], index=0
)
run_marker = st.sidebar.button("Run Marker")

use_llm = st.sidebar.checkbox(
    "Use LLM", help="Use LLM for higher quality processing", value=False
)
force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
strip_existing_ocr = st.sidebar.checkbox(
    "Strip existing OCR",
    help="Strip existing OCR text from the PDF and re-OCR.",
    value=False,
)
debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
disable_ocr_math = st.sidebar.checkbox(
    "Disable math",
    help="Disable math in OCR output - no inline math",
    value=False,
)

if not run_marker:
    st.stop()

# Run Marker
with tempfile.TemporaryDirectory() as tmp_dir:
    temp_pdf = os.path.join(tmp_dir, "temp.pdf")
    with open(temp_pdf, "wb") as f:
        f.write(in_file.getvalue())

    cli_options.update(
        {
            "output_format": output_format,
            "page_range": page_range,
            "force_ocr": force_ocr,
            "debug": debug,
            "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
            "use_llm": use_llm,
            "strip_existing_ocr": strip_existing_ocr,
            "disable_ocr_math": disable_ocr_math,
        }
    )
    config_parser = ConfigParser(cli_options)
    rendered = convert_pdf(temp_pdf, config_parser)
    page_range = config_parser.generate_config_dict()["page_range"]
    first_page = page_range[0] if page_range else 0

text, ext, images = text_from_rendered(rendered)
with col2:
    if output_format == "markdown":
        text = markdown_insert_images(text, images)
        st.markdown(text, unsafe_allow_html=True)
    elif output_format == "json":
        st.json(text)
    elif output_format == "html":
        st.html(text)
    elif output_format == "chunks":
        st.json(text)

if debug:
    with col1:
        debug_data_path = rendered.metadata.get("debug_data_path")
        if debug_data_path:
            pdf_image_path = os.path.join(debug_data_path, f"pdf_page_{first_page}.png")
            img = Image.open(pdf_image_path)
            st.image(img, caption="PDF debug image", use_container_width=True)
            layout_image_path = os.path.join(
                debug_data_path, f"layout_page_{first_page}.png"
            )
            img = Image.open(layout_image_path)
            st.image(img, caption="Layout debug image", use_container_width=True)
            st.write("Raw output:")
            st.code(text, language=output_format)
import os
import tempfile
import time
from multiprocessing import get_context
from concurrent.futures import ProcessPoolExecutor
import torch

import click
import pypdfium2 as pdfium
from tqdm import tqdm

import datasets


def get_next_pdf(ds: datasets.Dataset, i: int):
    """Return (pdf_bytes, filename, next_index) for the next usable PDF entry."""
    while True:
        record = ds[i]
        pdf, filename = record["pdf"], record["filename"]
        if pdf and filename.endswith(".pdf"):
            return pdf, filename, i + 1
        # Skip non-PDF entries, wrapping around the dataset.
        i = (i + 1) % len(ds)


def single_batch(
    batch_size: int,
    num_threads: int,
    force_ocr: bool,
    quantize: bool,
    compile: bool,
    worker_id: int,
    chunksize: int = 100,
):
    """Convert `batch_size` PDFs in this worker process; return timing/VRAM stats."""
    if quantize:
        os.environ["RECOGNITION_MODEL_QUANTIZE"] = "true"
    if compile:
        os.environ["COMPILE_ALL"] = "true"

    # Cap CPU parallelism of all sub-libraries to this worker's thread budget.
    thread_env_vars = (
        "DETECTOR_POSTPROCESSING_CPU_WORKERS",
        "OPENBLAS_NUM_THREADS",
        "PDFTEXT_CPU_WORKERS",
        "OMP_NUM_THREADS",
    )
    for var in thread_env_vars:
        os.environ[var] = f"{num_threads}"

    torch.set_num_threads(num_threads)

    # Imported here so the env vars above take effect before marker loads.
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict
    from marker.output import text_from_rendered

    ds = datasets.load_dataset("datalab-to/pdfs", split="train")
    model_dict = create_model_dict()
    torch.cuda.reset_peak_memory_stats()

    times = []
    idx = 0
    pages = 0
    chars = 0

    min_time = time.time()
    for _ in range(batch_size):
        pdf, fname, idx = get_next_pdf(ds, idx)
        print(f"Inferencing {fname} on worker {worker_id}...")

        pdf_doc = pdfium.PdfDocument(pdf)
        page_count = len(pdf_doc)
        pdf_doc.close()
        pages += page_count

        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            f.write(pdf)
            f.flush()
            # Convert in page chunks to bound memory usage per conversion.
            for chunk_start in range(0, page_count, chunksize):
                chunk_end = min(chunk_start + chunksize, page_count)

                converter = PdfConverter(
                    artifact_dict=model_dict,
                    config={
                        "disable_tqdm": worker_id > 0,
                        "page_range": list(range(chunk_start, chunk_end)),
                        "force_ocr": force_ocr,
                    },
                )
                start = time.time()
                rendered = converter(f.name)
                markdown, _, _ = text_from_rendered(rendered)
                chars += len(markdown)

                times.append(time.time() - start)

    max_gpu_vram = torch.cuda.max_memory_reserved() / 1024**3
    max_time = time.time()
    return sum(times), min_time, max_time, max_gpu_vram, pages, chars


@click.command(help="Benchmark PDF to MD conversion throughput.")
@click.option("--workers", default=1, help="Number of workers to use.")
@click.option("--batch_size", default=1, help="Batch size for inference.")
@click.option("--force_ocr", is_flag=True, help="Force OCR on all pages.")
@click.option("--quantize", is_flag=True, help="Use quantized model.")
@click.option("--compile", is_flag=True, help="Use compiled model.")
def main(
    workers: int,
    batch_size: int,
    force_ocr: bool,
    quantize: bool,
    compile: bool,
):
    """Fan out single_batch across worker processes and report aggregate stats."""
    total_cpus = os.cpu_count()
    start = time.time()
    current_gpu_vram = torch.cuda.memory_reserved() / 1024**3
    cpus_per_worker = min(8, max(2, total_cpus // workers))
    with ProcessPoolExecutor(
        max_workers=workers, mp_context=get_context("spawn")
    ) as executor:
        futures = [
            executor.submit(
                single_batch,
                batch_size,
                cpus_per_worker,
                force_ocr,
                quantize,
                compile,
                worker_id,
            )
            for worker_id in range(workers)
        ]
        all_times = []
        min_time = None
        max_time = time.time()
        vrams = []
        page_count = 0
        char_count = 0
        for future in tqdm(futures, desc="Running marker workers..."):
            times, min_time_worker, max_time_worker, max_vram, pages, chars = (
                future.result()
            )
            vrams.append(max_vram - current_gpu_vram)
            all_times.append(times)
            page_count += pages
            char_count += chars
            min_time = (
                min_time_worker if min_time is None else min(min_time, min_time_worker)
            )
            max_time = max(max_time, max_time_worker)

    end = time.time() - start
    all_worker_time = max_time - min_time

    print(f"Average time per worker: {sum(all_times) / len(all_times)}")
    print(f"Max time per worker: {max(all_times)}")
    print(f"End to end time (counting model loading), all processes: {end}")
    print(f"End to end time (no model loading), all processes: {all_worker_time}")
    print(f"Total pages: {page_count}")
    print(f"Total characters: {char_count}")
    print(f"Time per page: {all_worker_time / page_count:.2f}")
    print(f"Characters per second: {char_count / all_worker_time:.2f}")
    print(f"Max GPU VRAM: {max(vrams):.2f} GB")
    print(f"Average GPU VRAM: {sum(vrams) / len(vrams):.2f} GB")


if __name__ == "__main__":
    main()
import base64
import io
import re
from collections import Counter
from typing import Annotated, Optional, Tuple, Literal

from bs4 import BeautifulSoup
from pydantic import BaseModel

from marker.schema import BlockTypes
from marker.schema.blocks.base import BlockId, BlockOutput
from marker.schema.document import Document
from marker.settings import settings
from marker.util import assign_config


class BaseRenderer:
    """Base class for document renderers: image extraction and HTML assembly helpers."""

    image_blocks: Annotated[
        Tuple[BlockTypes, ...], "The block types to consider as images."
    ] = (BlockTypes.Picture, BlockTypes.Figure)
    extract_images: Annotated[bool, "Extract images from the document."] = True
    image_extraction_mode: Annotated[
        Literal["lowres", "highres"],
        "The mode to use for extracting images.",
    ] = "highres"
    keep_pageheader_in_output: Annotated[
        bool, "Keep the page header in the output HTML."
    ] = False
    keep_pagefooter_in_output: Annotated[
        bool, "Keep the page footer in the output HTML."
    ] = False
    add_block_ids: Annotated[bool, "Whether to add block IDs to the output HTML."] = (
        False
    )

    def __init__(self, config: Optional[BaseModel | dict] = None):
        assign_config(self, config)

        # Config forwarded into block rendering.
        self.block_config = {
            "keep_pageheader_in_output": self.keep_pageheader_in_output,
            "keep_pagefooter_in_output": self.keep_pagefooter_in_output,
            "add_block_ids": self.add_block_ids,
        }

    def __call__(self, document):
        # Children are in reading order
        raise NotImplementedError

    def extract_image(self, document: Document, image_id, to_base64=False):
        """Crop the image for a block; optionally return it base64-encoded."""
        image_block = document.get_block(image_id)
        cropped = image_block.get_image(
            document, highres=self.image_extraction_mode == "highres"
        )

        if to_base64:
            image_buffer = io.BytesIO()
            # RGBA to RGB
            if not cropped.mode == "RGB":
                cropped = cropped.convert("RGB")

            cropped.save(image_buffer, format=settings.OUTPUT_IMAGE_FORMAT)
            cropped = base64.b64encode(image_buffer.getvalue()).decode(
                settings.OUTPUT_ENCODING
            )
        return cropped

    @staticmethod
    def merge_consecutive_math(html, tag="math"):
        """Join hyphen-split words across consecutive math tags."""
        if not html:
            return html
        pattern = rf"-</{tag}>(\s*)<{tag}>"
        html = re.sub(pattern, " ", html)

        pattern = rf'-</{tag}>(\s*)<{tag} display="inline">'
        html = re.sub(pattern, " ", html)
        return html

    @staticmethod
    def merge_consecutive_tags(html, tag):
        """Collapse back-to-back identical tags (e.g. </b><b>) into one run."""
        if not html:
            return html

        def replace_whitespace(match):
            whitespace = match.group(1)
            if len(whitespace) == 0:
                return ""
            else:
                return " "

        pattern = rf"</{tag}>(\s*)<{tag}>"

        # Repeat until stable, since each pass can create new adjacencies.
        while True:
            new_merged = re.sub(pattern, replace_whitespace, html)
            if new_merged == html:
                break
            html = new_merged

        return html

    def generate_page_stats(self, document: Document, document_output):
        """Per-page block counts and aggregated block metadata for output metadata."""
        page_stats = []
        for page in document.pages:
            block_counts = Counter(
                [str(block.block_type) for block in page.children]
            ).most_common()
            block_metadata = page.aggregate_block_metadata()
            page_stats.append(
                {
                    "page_id": page.page_id,
                    "text_extraction_method": page.text_extraction_method,
                    "block_counts": block_counts,
                    "block_metadata": block_metadata.model_dump(),
                }
            )
        return page_stats

    def generate_document_metadata(self, document: Document, document_output):
        """Top-level output metadata: TOC, page stats, and debug path if set."""
        metadata = {
            "table_of_contents": document.table_of_contents,
            "page_stats": self.generate_page_stats(document, document_output),
        }
        if document.debug_data_path is not None:
            metadata["debug_data_path"] = document.debug_data_path

        return metadata

    def extract_block_html(self, document: Document, block_output: BlockOutput):
        """Recursively resolve <content-ref> tags in a block's HTML, collecting images."""
        soup = BeautifulSoup(block_output.html, "html.parser")

        content_refs = soup.find_all("content-ref")
        images = {}
        for ref in content_refs:
            src = ref.get("src")
            sub_images = {}
            # Reset per ref: previously these persisted across refs, so an
            # unmatched ref could reuse a stale id or hit an unbound `content`.
            content = ""
            ref_block_id = None
            for item in block_output.children:
                if item.id == src:
                    content, sub_images_ = self.extract_block_html(document, item)
                    sub_images.update(sub_images_)
                    ref_block_id: BlockId = item.id
                    break

            if ref_block_id is None:
                # No child matched this reference; leave the tag untouched rather than crash.
                continue

            if ref_block_id.block_type in self.image_blocks and self.extract_images:
                images[ref_block_id] = self.extract_image(
                    document, ref_block_id, to_base64=True
                )
            else:
                images.update(sub_images)
            ref.replace_with(BeautifulSoup(content, "html.parser"))

        if block_output.id.block_type in self.image_blocks and self.extract_images:
            images[block_output.id] = self.extract_image(
                document, block_output.id, to_base64=True
            )

        return str(soup), images
import json
from typing import List, Tuple

from tqdm import tqdm

from marker.logger import get_logger
from marker.processors.llm import BaseLLMComplexBlockProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block
from marker.schema.document import Document
from marker.schema.groups import PageGroup
from pydantic import BaseModel

logger = get_logger()


class LLMSectionHeaderProcessor(BaseLLMComplexBlockProcessor):
    """Sends all section headers to the LLM in one request to fix heading levels."""

    page_prompt = """You're a text correction expert specializing in accurately analyzing complex PDF documents. You will be given a list of all of the section headers from a document, along with their page number and approximate dimensions. The headers will be formatted like below, and will be presented in order.

```json
[
    {
        "bbox": [x1, y1, x2, y2],
        "width": x2 - x1,
        "height": y2 - y1,
        "page": 0,
        "id": "/page/0/SectionHeader/1",
        "html": "<h1>Introduction</h1>",
    }, ...
]
```

Bboxes have been normalized to 0-1000.

Your goal is to make sure that the section headers have the correct levels (h1, h2, h3, h4, h5, or h6). If a section header does not have the right level, edit the html to fix it.

Guidelines:
- Edit the blocks to ensure that the section headers have the correct levels.
- Only edit the h1, h2, h3, h4, h5, and h6 tags. Do not change any other tags or content in the headers.
- Only output the headers that changed (if nothing changed, output nothing).
- Every header you output needs to have one and only one level tag (h1, h2, h3, h4, h5, or h6).

**Instructions:**
1. Carefully examine the provided section headers and JSON.
2. Identify any changes you'll need to make, and write a short analysis.
3. Output "no_corrections", or "corrections_needed", depending on whether you need to make changes.
4. If corrections are needed, output any blocks that need updates. Only output the block ids and html, like this:
```json
[
    {
        "id": "/page/0/SectionHeader/1",
        "html": "<h2>Introduction</h2>"
    },
    ...
]
```

**Example:**
Input:
Section Headers
```json
[
    {
        "bbox": [x1, y1, x2, y2],
        "id": "/page/0/SectionHeader/1",
        "page": 0,
        "html": "1 Vector Operations",
    },
    {
        "bbox": [x1, y1, x2, y2],
        "id": "/page/0/SectionHeader/2",
        "page": 0,
        "html": "1.1 Vector Addition",
    },
]
```
Output:
Analysis: The first section header is missing the h1 tag, and the second section header is missing the h2 tag.
```json
[
    {
        "id": "/page/0/SectionHeader/1",
        "html": "<h1>1 Vector Operations</h1>"
    },
    {
        "id": "/page/0/SectionHeader/2",
        "html": "<h2>1.1 Vector Addition</h2>"
    }
]
```

**Input:**
Section Headers
```json
{{section_header_json}}
```
"""

    def get_selected_blocks(
        self,
        document: Document,
        page: PageGroup,
    ) -> List[dict]:
        """Return all blocks on a page as normalized JSON dicts."""
        selected_blocks = page.structure_blocks(document)
        json_blocks = [
            self.normalize_block_json(block, document, page, i)
            for i, block in enumerate(selected_blocks)
        ]
        return json_blocks

    def process_rewriting(
        self, document: Document, section_headers: List[Tuple[Block, dict]]
    ):
        """Build the prompt payload, query the LLM, and apply returned HTML edits."""
        section_header_json = [sh[1] for sh in section_headers]
        for item in section_header_json:
            _, _, page_id, block_type, block_id = item["id"].split("/")
            # Keep page as an int, matching the prompt's examples.
            item["page"] = int(page_id)
            item["width"] = item["bbox"][2] - item["bbox"][0]
            item["height"] = item["bbox"][3] - item["bbox"][1]
            del item["block_type"]  # Not needed, since they're all section headers

        prompt = self.page_prompt.replace(
            "{{section_header_json}}", json.dumps(section_header_json)
        )
        response = self.llm_service(
            prompt, None, document.pages[0], SectionHeaderSchema
        )
        logger.debug(f"Got section header response from LLM: {response}")

        if not response or "correction_type" not in response:
            logger.warning("LLM did not return a valid response")
            return

        correction_type = response["correction_type"]
        if correction_type == "no_corrections":
            return

        self.load_blocks(response)
        self.handle_rewrites(response["blocks"], document)

    def load_blocks(self, response):
        # The LLM sometimes returns the blocks list as a JSON string; parse it.
        if isinstance(response["blocks"], str):
            response["blocks"] = json.loads(response["blocks"])

    def rewrite_blocks(self, document: Document):
        # Don't show progress if there are no blocks to process
        section_headers = [
            (block, self.normalize_block_json(block, document, page))
            for page in document.pages
            for block in page.structure_blocks(document)
            if block.block_type == BlockTypes.SectionHeader
        ]
        if len(section_headers) == 0:
            return

        pbar = tqdm(
            total=1,
            desc=f"Running {self.__class__.__name__}",
            disable=self.disable_tqdm,
        )

        self.process_rewriting(document, section_headers)
        pbar.update(1)
        pbar.close()


class BlockSchema(BaseModel):
    id: str
    html: str


class SectionHeaderSchema(BaseModel):
    analysis: str
    correction_type: str
    blocks: List[BlockSchema]
import textwrap

from PIL import Image
from typing import Annotated, Tuple

from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from pydantic import BaseModel

from marker.renderers import BaseRenderer
from marker.schema import BlockTypes
from marker.schema.blocks import BlockId
from marker.settings import settings

# Ignore beautifulsoup warnings
import warnings

warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

# Suppress DecompressionBombError
Image.MAX_IMAGE_PIXELS = None


class HTMLOutput(BaseModel):
    html: str
    images: dict
    metadata: dict


class HTMLRenderer(BaseRenderer):
    """
    A renderer for HTML output.
    """

    page_blocks: Annotated[
        Tuple[BlockTypes],
        "The block types to consider as pages.",
    ] = (BlockTypes.Page,)
    paginate_output: Annotated[
        bool,
        "Whether to paginate the output.",
    ] = False

    def extract_image(self, document, image_id):
        """Crop and return the PIL image for an image block."""
        image_block = document.get_block(image_id)
        cropped = image_block.get_image(
            document, highres=self.image_extraction_mode == "highres"
        )
        return cropped

    def insert_block_id(self, soup, block_id: BlockId):
        """
        Insert a block ID into the soup as a data attribute.
        """
        if block_id.block_type in [BlockTypes.Line, BlockTypes.Span]:
            return soup

        if self.add_block_ids:
            # Find the outermost tag (first tag that isn't a NavigableString)
            outermost_tag = None
            for element in soup.contents:
                if hasattr(element, "name") and element.name:
                    outermost_tag = element
                    break

            # If we found an outermost tag, add the data-block-id attribute
            if outermost_tag:
                outermost_tag["data-block-id"] = str(block_id)

            # If soup only contains text or no tags, wrap in a span
            elif soup.contents:
                wrapper = soup.new_tag("span")
                wrapper["data-block-id"] = str(block_id)

                contents = list(soup.contents)
                for content in contents:
                    content.extract()
                    wrapper.append(content)
                soup.append(wrapper)
        return soup

    def extract_html(self, document, document_output, level=0):
        """Recursively resolve <content-ref> tags into rendered HTML, collecting images."""
        soup = BeautifulSoup(document_output.html, "html.parser")

        content_refs = soup.find_all("content-ref")
        images = {}
        for ref in content_refs:
            src = ref.get("src")
            sub_images = {}
            content = ""
            # Reset per ref: previously this was set once before the loop, so an
            # unmatched ref would crash on None or reuse the previous ref's id.
            ref_block_id = None
            for item in document_output.children:
                if item.id == src:
                    content, sub_images_ = self.extract_html(document, item, level + 1)
                    sub_images.update(sub_images_)
                    ref_block_id: BlockId = item.id
                    break

            if ref_block_id is None:
                # No child matched this reference; leave the tag untouched.
                continue

            if ref_block_id.block_type in self.image_blocks:
                if self.extract_images:
                    image = self.extract_image(document, ref_block_id)
                    image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
                    images[image_name] = image
                    element = BeautifulSoup(
                        f"<p>{content}<img src='{image_name}'></p>", "html.parser"
                    )
                    ref.replace_with(self.insert_block_id(element, ref_block_id))
                else:
                    # This will be the image description if using llm mode, or empty if not
                    element = BeautifulSoup(f"{content}", "html.parser")
                    ref.replace_with(self.insert_block_id(element, ref_block_id))
            elif ref_block_id.block_type in self.page_blocks:
                images.update(sub_images)
                if self.paginate_output:
                    content = f"<div class='page' data-page-id='{ref_block_id.page_id}'>{content}</div>"
                element = BeautifulSoup(f"{content}", "html.parser")
                ref.replace_with(self.insert_block_id(element, ref_block_id))
            else:
                images.update(sub_images)
                element = BeautifulSoup(f"{content}", "html.parser")
                ref.replace_with(self.insert_block_id(element, ref_block_id))

        output = str(soup)
        if level == 0:
            output = self.merge_consecutive_tags(output, "b")
            output = self.merge_consecutive_tags(output, "i")
            output = self.merge_consecutive_math(
                output
            )  # Merge consecutive inline math tags
            output = textwrap.dedent(f"""
            <!DOCTYPE html>
            <html>
                <head>
                    <meta charset="utf-8" />
                </head>
                <body>
                    {output}
                </body>
            </html>
""")

        return output, images

    def __call__(self, document) -> HTMLOutput:
        document_output = document.render(self.block_config)
        full_html, images = self.extract_html(document, document_output)
        soup = BeautifulSoup(full_html, "html.parser")
        full_html = soup.prettify()  # Add indentation to the HTML
        return HTMLOutput(
            html=full_html,
            images=images,
            metadata=self.generate_document_metadata(document, document_output),
        )
import get_logger 11 | 12 | logger = get_logger() 13 | 14 | 15 | class PageExtractionSchema(BaseModel): 16 | description: str 17 | detailed_notes: str 18 | 19 | 20 | class PageExtractor(BaseExtractor): 21 | """ 22 | An extractor that pulls data from a single page. 23 | """ 24 | 25 | extraction_page_chunk_size: Annotated[ 26 | int, "The number of pages to chunk together for extraction." 27 | ] = 3 28 | 29 | page_schema: Annotated[ 30 | str, 31 | "The JSON schema to be extracted from the page.", 32 | ] = "" 33 | 34 | page_extraction_prompt = """You are an expert document analyst who reads documents and pulls data out in JSON format. You will receive the markdown representation of a document page, and a JSON schema that we want to extract from the document. Your task is to write detailed notes on this page, so that when you look at all your notes from across the document, you can fill in the schema. 35 | 36 | Some notes: 37 | - The schema may contain a single object to extract from the entire document, or an array of objects. 38 | - The schema may contain nested objects, arrays, and other complex structures. 39 | 40 | Some guidelines: 41 | - Write very thorough notes, and include specific JSON snippets that can be extracted from the page. 42 | - You may need information from prior or subsequent pages to fully fill in the schema, so make sure to write detailed notes that will let you join entities across pages later on. 43 | - Estimate your confidence in the values you extract, so you can reconstruct the JSON later when you only have your notes. 44 | - Some tables and other data structures may continue on a subsequent page, so make sure to store the positions that data comes from where appropriate. 45 | 46 | **Instructions:** 47 | 1. Analyze the provided markdown representation of the page. 48 | 2. Analyze the JSON schema. 49 | 3. Write a short description of the fields in the schema, and the associated values in the markdown. 50 | 4. 
Write detailed notes on the page, including any values that can be extracted from the markdown. Include snippets of JSON that can be extracted from the page where possible. 51 | 52 | **Example:** 53 | Input: 54 | 55 | Markdown 56 | ```markdown 57 | | Make | Sales | 58 | |--------|-------| 59 | | Honda | 100 | 60 | | Toyota | 200 | 61 | ``` 62 | 63 | Schema 64 | 65 | ```json 66 | {'$defs': {'Cars': {'properties': {'make': {'title': 'Make', 'type': 'string'}, 'sales': {'title': 'Sales', 'type': 'integer'}, 'color': {'title': 'Color', 'type': 'string'}}, 'required': ['make', 'sales', 'color'], 'title': 'Cars', 'type': 'object'}}, 'properties': {'cars': {'items': {'$ref': '#/$defs/Cars'}, 'title': 'Cars', 'type': 'array'}}, 'required': ['cars'], 'title': 'CarsList', 'type': 'object'} 67 | ``` 68 | 69 | Output: 70 | 71 | Description: The schema has a list of cars, each with a make, sales, and color. The image and markdown contain a table with 2 cars: Honda with 100 sales and Toyota with 200 sales. The color is not present in the table. 72 | Detailed Notes: On this page, I see a table with car makes and sales. The makes are Honda and Toyota, with sales of 100 and 200 respectively. The color is not present in the table, so I will leave it blank in the JSON. That information may be present on another page. Some JSON snippets I may find useful later are: 73 | ```json 74 | { 75 | "make": "Honda", 76 | "sales": 100, 77 | } 78 | ``` 79 | ```json 80 | { 81 | "make": "Toyota", 82 | "sales": 200, 83 | } 84 | ``` 85 | 86 | Honda is the first row in the table, and Toyota is the second row. Make is the first column, and sales is the second. 87 | 88 | **Input:** 89 | 90 | Markdown 91 | ```markdown 92 | {{page_md}} 93 | ``` 94 | 95 | Schema 96 | ```json 97 | {{schema}} 98 | ``` 99 | """ 100 | 101 | def chunk_page_markdown(self, page_markdown: List[str]) -> List[str]: 102 | """ 103 | Chunk the page markdown into smaller pieces for processing. 
104 | """ 105 | 106 | chunks = [] 107 | for i in range(0, len(page_markdown), self.extraction_page_chunk_size): 108 | chunk = page_markdown[i : i + self.extraction_page_chunk_size] 109 | chunks.append("\n\n".join(chunk)) 110 | 111 | return chunks 112 | 113 | def inference_single_chunk( 114 | self, page_markdown: str 115 | ) -> Optional[PageExtractionSchema]: 116 | prompt = self.page_extraction_prompt.replace( 117 | "{{page_md}}", page_markdown 118 | ).replace("{{schema}}", json.dumps(self.page_schema)) 119 | response = self.llm_service(prompt, None, None, PageExtractionSchema) 120 | logger.debug(f"Page extraction response: {response}") 121 | 122 | if not response or any( 123 | [ 124 | key not in response 125 | for key in [ 126 | "description", 127 | "detailed_notes", 128 | ] 129 | ] 130 | ): 131 | return None 132 | 133 | return PageExtractionSchema( 134 | description=response["description"], 135 | detailed_notes=response["detailed_notes"], 136 | ) 137 | 138 | def __call__( 139 | self, 140 | page_markdown: List[str], 141 | **kwargs, 142 | ) -> List[PageExtractionSchema]: 143 | if not self.page_schema: 144 | raise ValueError( 145 | "Page schema must be defined for structured extraction to work." 
146 | ) 147 | 148 | chunks = self.chunk_page_markdown(page_markdown) 149 | results = [] 150 | pbar = tqdm( 151 | desc="Running page extraction", 152 | disable=self.disable_tqdm, 153 | total=len(chunks), 154 | ) 155 | 156 | with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: 157 | for future in [ 158 | executor.submit(self.inference_single_chunk, chunk) for chunk in chunks 159 | ]: 160 | results.append(future.result()) # Raise exceptions if any occurred 161 | pbar.update(1) 162 | 163 | pbar.close() 164 | return results 165 | ``` -------------------------------------------------------------------------------- /marker/scripts/common.py: -------------------------------------------------------------------------------- ```python 1 | import ast 2 | import base64 3 | import io 4 | import re 5 | import sys 6 | from typing import Optional 7 | 8 | from PIL import Image 9 | import click 10 | import pypdfium2 11 | import streamlit as st 12 | from pydantic import BaseModel 13 | from streamlit.runtime.uploaded_file_manager import UploadedFile 14 | 15 | from marker.config.parser import ConfigParser 16 | from marker.config.printer import CustomClickPrinter 17 | from marker.models import create_model_dict 18 | from marker.settings import settings 19 | 20 | 21 | @st.cache_data() 22 | def parse_args(): 23 | # Use to grab common cli options 24 | @ConfigParser.common_options 25 | def options_func(): 26 | pass 27 | 28 | def extract_click_params(decorated_function): 29 | if hasattr(decorated_function, "__click_params__"): 30 | return decorated_function.__click_params__ 31 | return [] 32 | 33 | cmd = CustomClickPrinter("Marker app.") 34 | extracted_params = extract_click_params(options_func) 35 | cmd.params.extend(extracted_params) 36 | ctx = click.Context(cmd) 37 | try: 38 | cmd_args = sys.argv[1:] 39 | cmd.parse_args(ctx, cmd_args) 40 | return ctx.params 41 | except click.exceptions.ClickException as e: 42 | return {"error": str(e)} 43 | 44 | 45 | @st.cache_resource() 
46 | def load_models(): 47 | return create_model_dict() 48 | 49 | 50 | def open_pdf(pdf_file): 51 | stream = io.BytesIO(pdf_file.getvalue()) 52 | return pypdfium2.PdfDocument(stream) 53 | 54 | 55 | def img_to_html(img, img_alt): 56 | img_bytes = io.BytesIO() 57 | img.save(img_bytes, format=settings.OUTPUT_IMAGE_FORMAT) 58 | img_bytes = img_bytes.getvalue() 59 | encoded = base64.b64encode(img_bytes).decode() 60 | img_html = f'<img src="data:image/{settings.OUTPUT_IMAGE_FORMAT.lower()};base64,{encoded}" alt="{img_alt}" style="max-width: 100%;">' 61 | return img_html 62 | 63 | 64 | @st.cache_data() 65 | def get_page_image(pdf_file, page_num, dpi=96): 66 | if "pdf" in pdf_file.type: 67 | doc = open_pdf(pdf_file) 68 | page = doc[page_num] 69 | png_image = ( 70 | page.render( 71 | scale=dpi / 72, 72 | ) 73 | .to_pil() 74 | .convert("RGB") 75 | ) 76 | else: 77 | png_image = Image.open(pdf_file).convert("RGB") 78 | return png_image 79 | 80 | 81 | @st.cache_data() 82 | def page_count(pdf_file: UploadedFile): 83 | if "pdf" in pdf_file.type: 84 | doc = open_pdf(pdf_file) 85 | return len(doc) - 1 86 | else: 87 | return 1 88 | 89 | 90 | def pillow_image_to_base64_string(img: Image) -> str: 91 | buffered = io.BytesIO() 92 | img.save(buffered, format="JPEG") 93 | return base64.b64encode(buffered.getvalue()).decode("utf-8") 94 | 95 | 96 | def extract_root_pydantic_class(schema_code: str) -> Optional[str]: 97 | try: 98 | # Parse the code into an AST 99 | tree = ast.parse(schema_code) 100 | 101 | # Find all class definitions that inherit from BaseModel 102 | class_names = set() 103 | class_info = {} # Store information about each class 104 | 105 | for node in ast.walk(tree): 106 | if isinstance(node, ast.ClassDef): 107 | # Check if this class inherits from BaseModel 108 | is_pydantic = False 109 | for base in node.bases: 110 | if isinstance(base, ast.Name) and base.id == "BaseModel": 111 | is_pydantic = True 112 | break 113 | 114 | if is_pydantic: 115 | class_names.add(node.name) 
116 | class_info[node.name] = { 117 | "references": set(), # Classes this class references 118 | "fields": [], # Field names in this class 119 | } 120 | 121 | # Extract field information 122 | for item in node.body: 123 | if isinstance(item, ast.AnnAssign) and isinstance( 124 | item.target, ast.Name 125 | ): 126 | field_name = item.target.id 127 | class_info[node.name]["fields"].append(field_name) 128 | 129 | # Check if this field references another class 130 | annotation_str = ast.unparse(item.annotation) 131 | 132 | # Look for List[ClassName], Optional[ClassName], Dict[Any, ClassName], etc. 133 | for other_class in class_names: 134 | pattern = rf"(?:List|Dict|Set|Tuple|Optional|Union)?\[.*{other_class}.*\]|{other_class}" 135 | if re.search(pattern, annotation_str): 136 | class_info[node.name]["references"].add(other_class) 137 | 138 | if len(class_names) == 1: 139 | return list(class_names)[0] 140 | 141 | referenced_classes = set() 142 | for class_name, info in class_info.items(): 143 | referenced_classes.update(info["references"]) 144 | 145 | # Find classes that reference others but aren't referenced themselves (potential roots) 146 | root_candidates = set() 147 | for class_name, info in class_info.items(): 148 | if info["references"] and class_name not in referenced_classes: 149 | root_candidates.add(class_name) 150 | 151 | # If we found exactly one root candidate, return it 152 | if len(root_candidates) == 1: 153 | return list(root_candidates)[0] 154 | 155 | return None 156 | except Exception as e: 157 | print(f"Error parsing schema: {e}") 158 | return None 159 | 160 | 161 | def get_root_class(schema_code: str) -> Optional[BaseModel]: 162 | root_class_name = extract_root_pydantic_class(schema_code) 163 | 164 | if not root_class_name: 165 | return None 166 | 167 | if "from pydantic" not in schema_code: 168 | schema_code = "from pydantic import BaseModel\n" + schema_code 169 | if "from typing" not in schema_code: 170 | schema_code = ( 171 | "from typing import 
List, Dict, Optional, Set, Tuple, Union, Any\n\n" 172 | + schema_code 173 | ) 174 | 175 | # Execute the code in a new namespace 176 | namespace = {} 177 | exec(schema_code, namespace) 178 | 179 | # Return the root class object 180 | return namespace.get(root_class_name) 181 | ``` -------------------------------------------------------------------------------- /marker/config/parser.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import os 3 | from typing import Dict 4 | 5 | import click 6 | 7 | from marker.converters.pdf import PdfConverter 8 | from marker.logger import get_logger 9 | from marker.renderers.chunk import ChunkRenderer 10 | from marker.renderers.html import HTMLRenderer 11 | from marker.renderers.json import JSONRenderer 12 | from marker.renderers.markdown import MarkdownRenderer 13 | from marker.settings import settings 14 | from marker.util import classes_to_strings, parse_range_str, strings_to_classes 15 | 16 | logger = get_logger() 17 | 18 | 19 | class ConfigParser: 20 | def __init__(self, cli_options: dict): 21 | self.cli_options = cli_options 22 | 23 | @staticmethod 24 | def common_options(fn): 25 | fn = click.option( 26 | "--output_dir", 27 | type=click.Path(exists=False), 28 | required=False, 29 | default=settings.OUTPUT_DIR, 30 | help="Directory to save output.", 31 | )(fn) 32 | fn = click.option("--debug", "-d", is_flag=True, help="Enable debug mode.")(fn) 33 | fn = click.option( 34 | "--output_format", 35 | type=click.Choice(["markdown", "json", "html", "chunks"]), 36 | default="markdown", 37 | help="Format to output results in.", 38 | )(fn) 39 | fn = click.option( 40 | "--processors", 41 | type=str, 42 | default=None, 43 | help="Comma separated list of processors to use. 
Must use full module path.", 44 | )(fn) 45 | fn = click.option( 46 | "--config_json", 47 | type=str, 48 | default=None, 49 | help="Path to JSON file with additional configuration.", 50 | )(fn) 51 | fn = click.option( 52 | "--disable_multiprocessing", 53 | is_flag=True, 54 | default=False, 55 | help="Disable multiprocessing.", 56 | )(fn) 57 | fn = click.option( 58 | "--disable_image_extraction", 59 | is_flag=True, 60 | default=False, 61 | help="Disable image extraction.", 62 | )(fn) 63 | # these are options that need a list transformation, i.e splitting/parsing a string 64 | fn = click.option( 65 | "--page_range", 66 | type=str, 67 | default=None, 68 | help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20", 69 | )(fn) 70 | 71 | # we put common options here 72 | fn = click.option( 73 | "--converter_cls", 74 | type=str, 75 | default=None, 76 | help="Converter class to use. Defaults to PDF converter.", 77 | )(fn) 78 | fn = click.option( 79 | "--llm_service", 80 | type=str, 81 | default=None, 82 | help="LLM service to use - should be full import path, like marker.services.gemini.GoogleGeminiService", 83 | )(fn) 84 | return fn 85 | 86 | def generate_config_dict(self) -> Dict[str, any]: 87 | config = {} 88 | output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR) 89 | for k, v in self.cli_options.items(): 90 | if not v: 91 | continue 92 | 93 | match k: 94 | case "debug": 95 | config["debug_pdf_images"] = True 96 | config["debug_layout_images"] = True 97 | config["debug_json"] = True 98 | config["debug_data_folder"] = output_dir 99 | case "page_range": 100 | config["page_range"] = parse_range_str(v) 101 | case "config_json": 102 | with open(v, "r", encoding="utf-8") as f: 103 | config.update(json.load(f)) 104 | case "disable_multiprocessing": 105 | config["pdftext_workers"] = 1 106 | case "disable_image_extraction": 107 | config["extract_images"] = False 108 | case _: 109 | config[k] = v 110 | 111 | # Backward 
compatibility for google_api_key 112 | if settings.GOOGLE_API_KEY: 113 | config["gemini_api_key"] = settings.GOOGLE_API_KEY 114 | 115 | return config 116 | 117 | def get_llm_service(self): 118 | # Only return an LLM service when use_llm is enabled 119 | if not self.cli_options.get("use_llm", False): 120 | return None 121 | 122 | service_cls = self.cli_options.get("llm_service", None) 123 | if service_cls is None: 124 | service_cls = "marker.services.gemini.GoogleGeminiService" 125 | return service_cls 126 | 127 | def get_renderer(self): 128 | match self.cli_options["output_format"]: 129 | case "json": 130 | r = JSONRenderer 131 | case "markdown": 132 | r = MarkdownRenderer 133 | case "html": 134 | r = HTMLRenderer 135 | case "chunks": 136 | r = ChunkRenderer 137 | case _: 138 | raise ValueError("Invalid output format") 139 | return classes_to_strings([r])[0] 140 | 141 | def get_processors(self): 142 | processors = self.cli_options.get("processors", None) 143 | if processors is not None: 144 | processors = processors.split(",") 145 | for p in processors: 146 | try: 147 | strings_to_classes([p]) 148 | except Exception as e: 149 | logger.error(f"Error loading processor: {p} with error: {e}") 150 | raise 151 | 152 | return processors 153 | 154 | def get_converter_cls(self): 155 | converter_cls = self.cli_options.get("converter_cls", None) 156 | if converter_cls is not None: 157 | try: 158 | return strings_to_classes([converter_cls])[0] 159 | except Exception as e: 160 | logger.error( 161 | f"Error loading converter: {converter_cls} with error: {e}" 162 | ) 163 | raise 164 | 165 | return PdfConverter 166 | 167 | def get_output_folder(self, filepath: str): 168 | output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR) 169 | fname_base = os.path.splitext(os.path.basename(filepath))[0] 170 | output_dir = os.path.join(output_dir, fname_base) 171 | os.makedirs(output_dir, exist_ok=True) 172 | return output_dir 173 | 174 | def get_base_filename(self, filepath: 
str): 175 | basename = os.path.basename(filepath) 176 | return os.path.splitext(basename)[0] 177 | ``` -------------------------------------------------------------------------------- /marker/builders/layout.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Annotated, List 2 | 3 | from surya.layout import LayoutPredictor 4 | from surya.layout.schema import LayoutResult, LayoutBox 5 | 6 | from marker.builders import BaseBuilder 7 | from marker.providers.pdf import PdfProvider 8 | from marker.schema import BlockTypes 9 | from marker.schema.document import Document 10 | from marker.schema.groups.page import PageGroup 11 | from marker.schema.polygon import PolygonBox 12 | from marker.schema.registry import get_block_class 13 | from marker.settings import settings 14 | 15 | 16 | class LayoutBuilder(BaseBuilder): 17 | """ 18 | A builder for performing layout detection on PDF pages and merging the results into the document. 19 | """ 20 | 21 | layout_batch_size: Annotated[ 22 | int, 23 | "The batch size to use for the layout model.", 24 | "Default is None, which will use the default batch size for the model.", 25 | ] = None 26 | force_layout_block: Annotated[ 27 | str, 28 | "Skip layout and force every page to be treated as a specific block type.", 29 | ] = None 30 | disable_tqdm: Annotated[ 31 | bool, 32 | "Disable tqdm progress bars.", 33 | ] = False 34 | expand_block_types: Annotated[ 35 | List[BlockTypes], 36 | "Block types whose bounds should be expanded to accomodate missing regions", 37 | ] = [ 38 | BlockTypes.Picture, 39 | BlockTypes.Figure, 40 | BlockTypes.ComplexRegion, 41 | ] # Does not include groups since they are only injected later 42 | max_expand_frac: Annotated[ 43 | float, "The maximum fraction to expand the layout box bounds by" 44 | ] = 0.05 45 | 46 | def __init__(self, layout_model: LayoutPredictor, config=None): 47 | self.layout_model = layout_model 48 | 49 | super().__init__(config) 
50 | 51 | def __call__(self, document: Document, provider: PdfProvider): 52 | if self.force_layout_block is not None: 53 | # Assign the full content of every page to a single layout type 54 | layout_results = self.forced_layout(document.pages) 55 | else: 56 | layout_results = self.surya_layout(document.pages) 57 | self.add_blocks_to_pages(document.pages, layout_results) 58 | self.expand_layout_blocks(document) 59 | 60 | def get_batch_size(self): 61 | if self.layout_batch_size is not None: 62 | return self.layout_batch_size 63 | elif settings.TORCH_DEVICE_MODEL == "cuda": 64 | return 12 65 | return 6 66 | 67 | def forced_layout(self, pages: List[PageGroup]) -> List[LayoutResult]: 68 | layout_results = [] 69 | for page in pages: 70 | layout_results.append( 71 | LayoutResult( 72 | image_bbox=page.polygon.bbox, 73 | bboxes=[ 74 | LayoutBox( 75 | label=self.force_layout_block, 76 | position=0, 77 | top_k={self.force_layout_block: 1}, 78 | polygon=page.polygon.polygon, 79 | ), 80 | ], 81 | sliced=False, 82 | ) 83 | ) 84 | return layout_results 85 | 86 | def surya_layout(self, pages: List[PageGroup]) -> List[LayoutResult]: 87 | self.layout_model.disable_tqdm = self.disable_tqdm 88 | layout_results = self.layout_model( 89 | [p.get_image(highres=False) for p in pages], 90 | batch_size=int(self.get_batch_size()), 91 | ) 92 | return layout_results 93 | 94 | def expand_layout_blocks(self, document: Document): 95 | for page in document.pages: 96 | # Collect all blocks on this page as PolygonBox for easy access 97 | page_blocks = [document.get_block(bid) for bid in page.structure] 98 | page_size = page.polygon.size 99 | 100 | for block_id in page.structure: 101 | block = document.get_block(block_id) 102 | if block.block_type in self.expand_block_types: 103 | other_blocks = [b for b in page_blocks if b != block] 104 | if not other_blocks: 105 | block.polygon = block.polygon.expand( 106 | self.max_expand_frac, self.max_expand_frac 107 | ).fit_to_bounds((0, 0, *page_size)) 108 | 
continue 109 | 110 | min_gap = min( 111 | block.polygon.minimum_gap(other.polygon) 112 | for other in other_blocks 113 | ) 114 | if min_gap <= 0: 115 | continue 116 | 117 | x_expand_frac = ( 118 | min_gap / block.polygon.width if block.polygon.width > 0 else 0 119 | ) 120 | y_expand_frac = ( 121 | min_gap / block.polygon.height 122 | if block.polygon.height > 0 123 | else 0 124 | ) 125 | 126 | block.polygon = block.polygon.expand( 127 | min(self.max_expand_frac, x_expand_frac), 128 | min(self.max_expand_frac, y_expand_frac), 129 | ).fit_to_bounds((0, 0, *page_size)) 130 | 131 | def add_blocks_to_pages( 132 | self, pages: List[PageGroup], layout_results: List[LayoutResult] 133 | ): 134 | for page, layout_result in zip(pages, layout_results): 135 | layout_page_size = PolygonBox.from_bbox(layout_result.image_bbox).size 136 | provider_page_size = page.polygon.size 137 | page.layout_sliced = ( 138 | layout_result.sliced 139 | ) # This indicates if the page was sliced by the layout model 140 | for bbox in sorted(layout_result.bboxes, key=lambda x: x.position): 141 | block_cls = get_block_class(BlockTypes[bbox.label]) 142 | layout_block = page.add_block( 143 | block_cls, PolygonBox(polygon=bbox.polygon) 144 | ) 145 | layout_block.polygon = layout_block.polygon.rescale( 146 | layout_page_size, provider_page_size 147 | ).fit_to_bounds((0, 0, *provider_page_size)) 148 | layout_block.top_k = { 149 | BlockTypes[label]: prob 150 | for (label, prob) in bbox.top_k.items() 151 | if label in BlockTypes.__members__ 152 | } 153 | page.add_structure(layout_block) 154 | 155 | # Ensure page has non-empty structure 156 | if page.structure is None: 157 | page.structure = [] 158 | 159 | # Ensure page has non-empty children 160 | if page.children is None: 161 | page.children = [] 162 | ``` -------------------------------------------------------------------------------- /marker/scripts/convert.py: -------------------------------------------------------------------------------- ```python 1 | 
import atexit 2 | import os 3 | import time 4 | 5 | import psutil 6 | import torch 7 | 8 | from marker.utils.batch import get_batch_sizes_worker_counts 9 | 10 | # Ensure threads don't contend 11 | os.environ["MKL_DYNAMIC"] = "FALSE" 12 | os.environ["OMP_DYNAMIC"] = "FALSE" 13 | os.environ["OMP_NUM_THREADS"] = "2" # Avoid OpenMP issues with multiprocessing 14 | os.environ["OPENBLAS_NUM_THREADS"] = "2" 15 | os.environ["MKL_NUM_THREADS"] = "2" 16 | os.environ["GRPC_VERBOSITY"] = "ERROR" 17 | os.environ["GLOG_minloglevel"] = "2" 18 | os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = ( 19 | "1" # Transformers uses .isin for a simple op, which is not supported on MPS 20 | ) 21 | os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya 22 | 23 | import math 24 | import traceback 25 | 26 | import click 27 | import torch.multiprocessing as mp 28 | from tqdm import tqdm 29 | import gc 30 | 31 | from marker.config.parser import ConfigParser 32 | from marker.config.printer import CustomClickPrinter 33 | from marker.logger import configure_logging, get_logger 34 | from marker.models import create_model_dict 35 | from marker.output import output_exists, save_output 36 | from marker.utils.gpu import GPUManager 37 | 38 | configure_logging() 39 | logger = get_logger() 40 | 41 | 42 | def worker_init(): 43 | model_dict = create_model_dict() 44 | 45 | global model_refs 46 | model_refs = model_dict 47 | 48 | # Ensure we clean up the model references on exit 49 | atexit.register(worker_exit) 50 | 51 | 52 | def worker_exit(): 53 | global model_refs 54 | try: 55 | del model_refs 56 | except Exception: 57 | pass 58 | 59 | 60 | def process_single_pdf(args): 61 | page_count = 0 62 | fpath, cli_options = args 63 | torch.set_num_threads(cli_options["total_torch_threads"]) 64 | del cli_options["total_torch_threads"] 65 | 66 | config_parser = ConfigParser(cli_options) 67 | 68 | out_folder = config_parser.get_output_folder(fpath) 69 | base_name = config_parser.get_base_filename(fpath) 
70 | if cli_options.get("skip_existing") and output_exists(out_folder, base_name): 71 | return page_count 72 | 73 | converter_cls = config_parser.get_converter_cls() 74 | config_dict = config_parser.generate_config_dict() 75 | config_dict["disable_tqdm"] = True 76 | 77 | try: 78 | if cli_options.get("debug_print"): 79 | logger.debug(f"Converting {fpath}") 80 | converter = converter_cls( 81 | config=config_dict, 82 | artifact_dict=model_refs, 83 | processor_list=config_parser.get_processors(), 84 | renderer=config_parser.get_renderer(), 85 | llm_service=config_parser.get_llm_service(), 86 | ) 87 | rendered = converter(fpath) 88 | out_folder = config_parser.get_output_folder(fpath) 89 | save_output(rendered, out_folder, base_name) 90 | page_count = converter.page_count 91 | 92 | if cli_options.get("debug_print"): 93 | logger.debug(f"Converted {fpath}") 94 | del rendered 95 | del converter 96 | except Exception as e: 97 | logger.error(f"Error converting {fpath}: {e}") 98 | traceback.print_exc() 99 | finally: 100 | gc.collect() 101 | 102 | return page_count 103 | 104 | 105 | @click.command(cls=CustomClickPrinter) 106 | @click.argument("in_folder", type=str) 107 | @click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert") 108 | @click.option( 109 | "--num_chunks", 110 | type=int, 111 | default=1, 112 | help="Number of chunks being processed in parallel", 113 | ) 114 | @click.option( 115 | "--max_files", type=int, default=None, help="Maximum number of pdfs to convert" 116 | ) 117 | @click.option( 118 | "--skip_existing", 119 | is_flag=True, 120 | default=False, 121 | help="Skip existing converted files.", 122 | ) 123 | @click.option( 124 | "--debug_print", is_flag=True, default=False, help="Print debug information." 
125 | ) 126 | @click.option( 127 | "--max_tasks_per_worker", 128 | type=int, 129 | default=10, 130 | help="Maximum number of tasks per worker process before recycling.", 131 | ) 132 | @click.option( 133 | "--workers", 134 | type=int, 135 | default=None, 136 | help="Number of worker processes to use. Set automatically by default, but can be overridden.", 137 | ) 138 | @ConfigParser.common_options 139 | def convert_cli(in_folder: str, **kwargs): 140 | total_pages = 0 141 | in_folder = os.path.abspath(in_folder) 142 | files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)] 143 | files = [f for f in files if os.path.isfile(f)] 144 | 145 | # Handle chunks if we're processing in parallel 146 | # Ensure we get all files into a chunk 147 | chunk_size = math.ceil(len(files) / kwargs["num_chunks"]) 148 | start_idx = kwargs["chunk_idx"] * chunk_size 149 | end_idx = start_idx + chunk_size 150 | files_to_convert = files[start_idx:end_idx] 151 | 152 | # Limit files converted if needed 153 | if kwargs["max_files"]: 154 | files_to_convert = files_to_convert[: kwargs["max_files"]] 155 | 156 | # Disable nested multiprocessing 157 | kwargs["disable_multiprocessing"] = True 158 | 159 | try: 160 | mp.set_start_method("spawn") # Required for CUDA, forkserver doesn't work 161 | except RuntimeError: 162 | raise RuntimeError( 163 | "Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again." 
164 | ) 165 | 166 | chunk_idx = kwargs["chunk_idx"] 167 | 168 | # Use GPU context manager for automatic setup/cleanup 169 | with GPUManager(chunk_idx) as gpu_manager: 170 | batch_sizes, workers = get_batch_sizes_worker_counts(gpu_manager, 7) 171 | 172 | # Override workers if specified 173 | if kwargs["workers"] is not None: 174 | workers = kwargs["workers"] 175 | 176 | # Set proper batch sizes and thread counts 177 | total_processes = max(1, min(len(files_to_convert), workers)) 178 | kwargs["total_torch_threads"] = max( 179 | 2, psutil.cpu_count(logical=False) // total_processes 180 | ) 181 | kwargs.update(batch_sizes) 182 | 183 | logger.info( 184 | f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}" 185 | ) 186 | task_args = [(f, kwargs) for f in files_to_convert] 187 | 188 | start_time = time.time() 189 | with mp.Pool( 190 | processes=total_processes, 191 | initializer=worker_init, 192 | maxtasksperchild=kwargs["max_tasks_per_worker"], 193 | ) as pool: 194 | pbar = tqdm(total=len(task_args), desc="Processing PDFs", unit="pdf") 195 | for page_count in pool.imap_unordered(process_single_pdf, task_args): 196 | pbar.update(1) 197 | total_pages += page_count 198 | pbar.close() 199 | 200 | total_time = time.time() - start_time 201 | print( 202 | f"Inferenced {total_pages} pages in {total_time:.2f} seconds, for a throughput of {total_pages / total_time:.2f} pages/sec for chunk {chunk_idx + 1}/{kwargs['num_chunks']}" 203 | ) 204 | ``` -------------------------------------------------------------------------------- /marker/processors/llm/__init__.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import traceback 3 | from concurrent.futures import ThreadPoolExecutor, as_completed 4 | from typing import Annotated, TypedDict, List, Sequence 5 | 6 | from pydantic import BaseModel 7 | from tqdm 
import tqdm 8 | from PIL import Image 9 | 10 | from marker.output import json_to_html 11 | from marker.processors import BaseProcessor 12 | from marker.schema import BlockTypes 13 | from marker.schema.blocks import Block, BlockId 14 | from marker.schema.document import Document 15 | from marker.schema.groups import PageGroup 16 | from marker.services import BaseService 17 | from marker.util import assign_config 18 | from marker.logger import get_logger 19 | 20 | logger = get_logger() 21 | 22 | 23 | class PromptData(TypedDict): 24 | prompt: str 25 | image: Image.Image 26 | block: Block 27 | schema: BaseModel 28 | page: PageGroup 29 | additional_data: dict | None 30 | 31 | 32 | class BlockData(TypedDict): 33 | page: PageGroup 34 | block: Block 35 | 36 | 37 | class BaseLLMProcessor(BaseProcessor): 38 | """ 39 | A processor for using LLMs to convert blocks. 40 | """ 41 | 42 | max_concurrency: Annotated[ 43 | int, 44 | "The maximum number of concurrent requests to make to the Gemini model.", 45 | ] = 3 46 | image_expansion_ratio: Annotated[ 47 | float, 48 | "The ratio to expand the image by when cropping.", 49 | ] = 0.01 50 | use_llm: Annotated[ 51 | bool, 52 | "Whether to use the LLM model.", 53 | ] = False 54 | disable_tqdm: Annotated[ 55 | bool, 56 | "Whether to disable the tqdm progress bar.", 57 | ] = False 58 | block_types = None 59 | 60 | def __init__(self, llm_service: BaseService, config=None): 61 | super().__init__(config) 62 | 63 | self.llm_service = None 64 | if not self.use_llm: 65 | return 66 | 67 | self.llm_service = llm_service 68 | 69 | def extract_image( 70 | self, 71 | document: Document, 72 | image_block: Block, 73 | remove_blocks: Sequence[BlockTypes] | None = None, 74 | ) -> Image.Image: 75 | return image_block.get_image( 76 | document, 77 | highres=True, 78 | expansion=(self.image_expansion_ratio, self.image_expansion_ratio), 79 | remove_blocks=remove_blocks, 80 | ) 81 | 82 | def normalize_block_json(self, block: Block, document: Document, page: 
PageGroup): 83 | """ 84 | Get the normalized JSON representation of a block for the LLM. 85 | """ 86 | page_width = page.polygon.width 87 | page_height = page.polygon.height 88 | block_bbox = block.polygon.bbox 89 | 90 | # Normalize bbox to 0-1000 range 91 | normalized_bbox = [ 92 | (block_bbox[0] / page_width) * 1000, 93 | (block_bbox[1] / page_height) * 1000, 94 | (block_bbox[2] / page_width) * 1000, 95 | (block_bbox[3] / page_height) * 1000, 96 | ] 97 | 98 | block_json = { 99 | "id": str(block.id), 100 | "block_type": str(block.id.block_type), 101 | "bbox": normalized_bbox, 102 | "html": json_to_html(block.render(document)), 103 | } 104 | 105 | return block_json 106 | 107 | def load_blocks(self, response: dict): 108 | return [json.loads(block) for block in response["blocks"]] 109 | 110 | def handle_rewrites(self, blocks: list, document: Document): 111 | for block_data in blocks: 112 | try: 113 | block_id = block_data["id"].strip().lstrip("/") 114 | _, page_id, block_type, block_id = block_id.split("/") 115 | block_id = BlockId( 116 | page_id=page_id, 117 | block_id=block_id, 118 | block_type=getattr(BlockTypes, block_type), 119 | ) 120 | block = document.get_block(block_id) 121 | if not block: 122 | logger.debug(f"Block {block_id} not found in document") 123 | continue 124 | 125 | if hasattr(block, "html"): 126 | block.html = block_data["html"] 127 | except Exception as e: 128 | logger.debug(f"Error parsing block ID {block_data['id']}: {e}") 129 | continue 130 | 131 | 132 | class BaseLLMComplexBlockProcessor(BaseLLMProcessor): 133 | """ 134 | A processor for using LLMs to convert blocks with more complex logic. 
135 | """ 136 | 137 | def __call__(self, document: Document): 138 | if not self.use_llm or self.llm_service is None: 139 | return 140 | 141 | try: 142 | self.rewrite_blocks(document) 143 | except Exception as e: 144 | logger.warning(f"Error rewriting blocks in {self.__class__.__name__}: {e}") 145 | 146 | def process_rewriting(self, document: Document, page: PageGroup, block: Block): 147 | raise NotImplementedError() 148 | 149 | def rewrite_blocks(self, document: Document): 150 | # Don't show progress if there are no blocks to process 151 | total_blocks = sum( 152 | len(page.contained_blocks(document, self.block_types)) 153 | for page in document.pages 154 | ) 155 | if total_blocks == 0: 156 | return 157 | 158 | pbar = tqdm( 159 | total=total_blocks, 160 | desc=f"{self.__class__.__name__} running", 161 | disable=self.disable_tqdm 162 | ) 163 | with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: 164 | for future in as_completed( 165 | [ 166 | executor.submit(self.process_rewriting, document, page, block) 167 | for page in document.pages 168 | for block in page.contained_blocks(document, self.block_types) 169 | ] 170 | ): 171 | future.result() # Raise exceptions if any occurred 172 | pbar.update(1) 173 | 174 | pbar.close() 175 | 176 | 177 | class BaseLLMSimpleBlockProcessor(BaseLLMProcessor): 178 | """ 179 | A processor for using LLMs to convert single blocks. 
180 | """ 181 | 182 | # Override init since we don't need an llmservice here 183 | def __init__(self, config=None): 184 | assign_config(self, config) 185 | 186 | def __call__(self, result: dict, prompt_data: PromptData, document: Document): 187 | try: 188 | self.rewrite_block(result, prompt_data, document) 189 | except Exception as e: 190 | logger.warning(f"Error rewriting block in {self.__class__.__name__}: {e}") 191 | traceback.print_exc() 192 | 193 | def inference_blocks(self, document: Document) -> List[BlockData]: 194 | blocks = [] 195 | for page in document.pages: 196 | for block in page.contained_blocks(document, self.block_types): 197 | blocks.append({"page": page, "block": block}) 198 | return blocks 199 | 200 | def block_prompts(self, document: Document) -> List[PromptData]: 201 | raise NotImplementedError() 202 | 203 | def rewrite_block( 204 | self, response: dict, prompt_data: PromptData, document: Document 205 | ): 206 | raise NotImplementedError() 207 | ``` -------------------------------------------------------------------------------- /benchmarks/overall/scorers/llm.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import os 3 | import tempfile 4 | import time 5 | from typing import List 6 | 7 | from PIL import Image 8 | from google.genai.errors import APIError 9 | from google import genai 10 | import pypdfium2 as pdfium 11 | 12 | from benchmarks.overall.scorers import BaseScorer, BlockScores 13 | from marker.settings import settings 14 | 15 | rating_prompt = """ 16 | You're a document analysis expert who is comparing some markdown to an image to make sure the markdown is correct. You're rating how effectively the provided markdown represents the full text and formatting in the image provided. 17 | You're given an image, along with the extracted markdown: 18 | - Some parts of the page may have been recognized as images and linked from the markdown, like ``. 
19 | - Tables will be formatted as Github flavored markdown.
20 | - Block equations will be in LaTeX.
21 | - The image and markdown may be in any language.
22 | - The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text.
23 | 
24 | The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided.
25 | 
26 | **Instructions**
27 | Follow this process to evaluate the markdown:
28 | 1. Carefully examine the image.
29 | 2. Carefully examine the markdown input provided.
30 | 3. Compare the image to the markdown representation. Does the markdown representation properly represent the important text and formatting in the image?
31 | 4. Assign component scores, as described below.
32 | 
33 | These are the primary scores:
34 | - Overall - the overall quality of the markdown as compared to the image.
35 | - Text quality - the quality of the text extraction from the image.
36 | - Formatting quality - the quality of the formatting applied to the markdown, as compared to the image.
37 | 
38 | Depending on which elements are present in the markdown, you will assign element-specific scores.
39 | - Tables - how effectively the tables have been extracted and formatted.
40 | - Forms - how effectively the forms have been extracted and formatted.
41 | - Equations - how effectively block equations have been converted to LaTeX.
42 | - Section headers - if all of the section headers have been detected, and the right levels set.
43 | - Lists - if the lists have been properly extracted and formatted.
44 | - Images - if images are identified and placed correctly.
45 | 
46 | Notes on scoring:
47 | - To get a 5/5, all of the important text from the image must appear in the markdown, and the formatting should be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text.
48 | - A 3/5 may have small missing text elements from the markdown and/or moderate formatting issues.
49 | - A 1/5 will have major missing text segments from the markdown or completely unreadable formatting.
50 | - Use 0/5 if a field isn't applicable, like if the image doesn't contain a table.
51 | 
52 | If text that is important to the meaning of the document is missing, do not score higher than 3/5.
53 | 
54 | Output json, like in the example below.
55 | 
56 | **Example**
57 | Input
58 | ```markdown
59 | # Section 1
60 | This is some *markdown* extracted from a document. Here is a block equation:
61 | $$\\frac{ab \\cdot x^5 + x^2 + 2 \\cdot x + 123}{t}$$
62 | ```
63 | Output
64 | ```json
65 | {
66 |     "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.",
67 |     "markdown_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.",
68 |     "comparison": "The text and formatting matches the image. There are no formatting or text extraction issues. 
The equations and section headers are correct.", 69 | "overall": 5, 70 | "text": 5, 71 | "formatting": 5, 72 | "section_headers": 5, 73 | "tables": 0, 74 | "forms": 0, 75 | "equations": 5, 76 | "lists": 0, 77 | "images": 0 78 | } 79 | ``` 80 | **Input** 81 | ```markdown 82 | {{markdown}} 83 | ``` 84 | **Output** 85 | """ 86 | 87 | comparison_keys = ["comparison"] 88 | description_keys = ["image_description", "markdown_description"] 89 | text_keys = comparison_keys + description_keys 90 | score_keys = ["overall", "text", "formatting", "section_headers", "tables", "forms", "equations", 91 | "lists", "images"] 92 | 93 | 94 | class LLMScorer(BaseScorer): 95 | def __call__(self, sample, gt_markdown: List[str], markdown: str) -> BlockScores: 96 | pdf_bytes = sample["pdf"] 97 | with tempfile.NamedTemporaryFile(suffix=".pdf") as f: 98 | f.write(pdf_bytes) 99 | f.flush() 100 | f.seek(0) 101 | doc = pdfium.PdfDocument(f.name) 102 | img = doc[0].render(scale=96/72).to_pil() 103 | doc.close() 104 | 105 | return self.llm_rater(img, markdown) 106 | 107 | 108 | def llm_rater(self, img: Image.Image, markdown: str) -> BlockScores: 109 | if not markdown: 110 | null_scores = {k: 1 for k in score_keys} 111 | text_scores = {k: "" for k in text_keys} 112 | null_scores.update(text_scores) 113 | return { 114 | "score": 1, 115 | "specific_scores": null_scores 116 | } 117 | req_keys = text_keys + score_keys 118 | properties = {} 119 | for key in req_keys: 120 | content_type = "INTEGER" if key in score_keys else "STRING" 121 | properties[key] = {"type": content_type} 122 | 123 | response_schema = { 124 | "required": req_keys, 125 | "properties": properties, 126 | "type": "OBJECT" 127 | } 128 | prompt = rating_prompt.replace("{{markdown}}", markdown) 129 | response = self.llm_response_wrapper([img, prompt], response_schema) 130 | assert all([k in response for k in req_keys]), f"Missing keys in response: {response}" 131 | return { 132 | "score": response["overall"], 133 | "specific_scores": 
response, 134 | } 135 | 136 | def llm_response_wrapper(self, prompt, response_schema, depth=0): 137 | client = genai.Client( 138 | http_options={"timeout": 60000}, 139 | vertexai=True, 140 | project=os.getenv("VERTEX_PROJECT_ID"), 141 | location=os.getenv("VERTEX_LOCATION"), 142 | ) 143 | try: 144 | responses = client.models.generate_content( 145 | model="gemini-2.0-flash-001", 146 | contents=prompt, 147 | config={ 148 | "temperature": 0, 149 | "response_schema": response_schema, 150 | "response_mime_type": "application/json", 151 | }, 152 | ) 153 | output = responses.candidates[0].content.parts[0].text 154 | return json.loads(output) 155 | except APIError as e: 156 | print(f"Hit Gemini rate limit, waiting 120 seconds") 157 | time.sleep(120) 158 | if depth > 2: 159 | raise e 160 | return self.llm_response_wrapper(prompt, response_schema, depth + 1) ``` -------------------------------------------------------------------------------- /tests/processors/test_llm_processors.py: -------------------------------------------------------------------------------- ```python 1 | from unittest.mock import MagicMock, Mock 2 | 3 | import pytest 4 | from marker.processors.llm.llm_complex import LLMComplexRegionProcessor 5 | from marker.processors.llm.llm_equation import LLMEquationProcessor 6 | 7 | from marker.processors.llm.llm_form import LLMFormProcessor 8 | from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor 9 | from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor 10 | from marker.processors.llm.llm_table import LLMTableProcessor 11 | from marker.processors.table import TableProcessor 12 | from marker.renderers.markdown import MarkdownRenderer 13 | from marker.schema import BlockTypes 14 | from marker.schema.blocks import ComplexRegion 15 | 16 | 17 | @pytest.mark.filename("form_1040.pdf") 18 | @pytest.mark.config({"page_range": [0]}) 19 | def test_llm_form_processor_no_config(pdf_document, llm_service): 20 | processor_lst = 
[LLMFormProcessor()] 21 | processor = LLMSimpleBlockMetaProcessor(processor_lst, llm_service) 22 | processor(pdf_document) 23 | 24 | forms = pdf_document.contained_blocks((BlockTypes.Form,)) 25 | assert forms[0].html is None 26 | 27 | 28 | @pytest.mark.filename("form_1040.pdf") 29 | @pytest.mark.config({"page_range": [0]}) 30 | def test_llm_form_processor_no_cells(pdf_document, llm_service): 31 | config = {"use_llm": True, "gemini_api_key": "test"} 32 | processor_lst = [LLMFormProcessor(config)] 33 | processor = LLMSimpleBlockMetaProcessor(processor_lst, llm_service, config) 34 | processor(pdf_document) 35 | 36 | forms = pdf_document.contained_blocks((BlockTypes.Form,)) 37 | assert forms[0].html is None 38 | 39 | 40 | @pytest.mark.filename("form_1040.pdf") 41 | @pytest.mark.config({"page_range": [0]}) 42 | def test_llm_form_processor(pdf_document, table_rec_model, recognition_model, detection_model): 43 | corrected_html = "<em>This is corrected markdown.</em>\n" * 100 44 | corrected_html = "<p>" + corrected_html.strip() + "</p>\n" 45 | 46 | mock_cls = Mock() 47 | mock_cls.return_value = {"corrected_html": corrected_html} 48 | 49 | cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model) 50 | cell_processor(pdf_document) 51 | 52 | config = {"use_llm": True, "gemini_api_key": "test"} 53 | processor_lst = [LLMFormProcessor(config)] 54 | processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) 55 | processor(pdf_document) 56 | 57 | forms = pdf_document.contained_blocks((BlockTypes.Form,)) 58 | assert forms[0].html == corrected_html.strip() 59 | 60 | 61 | 62 | @pytest.mark.filename("table_ex2.pdf") 63 | @pytest.mark.config({"page_range": [0]}) 64 | def test_llm_table_processor(pdf_document, table_rec_model, recognition_model, detection_model): 65 | corrected_html = """ 66 | <table> 67 | <tr> 68 | <td>Column 1</td> 69 | <td>Column 2</td> 70 | <td>Column 3</td> 71 | <td>Column 4</td> 72 | </tr> 73 | <tr> 74 | <td>Value 1 
<math>x</math></td> 75 | <td>Value 2</td> 76 | <td>Value 3</td> 77 | <td>Value 4</td> 78 | </tr> 79 | <tr> 80 | <td>Value 5</td> 81 | <td>Value 6</td> 82 | <td>Value 7</td> 83 | <td>Value 8</td> 84 | </tr> 85 | </table> 86 | """.strip() 87 | 88 | mock_cls = Mock() 89 | mock_cls.return_value = {"corrected_html": corrected_html} 90 | 91 | cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model) 92 | cell_processor(pdf_document) 93 | 94 | processor = LLMTableProcessor(mock_cls, {"use_llm": True, "gemini_api_key": "test"}) 95 | processor(pdf_document) 96 | 97 | tables = pdf_document.contained_blocks((BlockTypes.Table,)) 98 | table_cells = tables[0].contained_blocks(pdf_document, (BlockTypes.TableCell,)) 99 | assert table_cells[0].text == "Column 1" 100 | 101 | markdown = MarkdownRenderer()(pdf_document).markdown 102 | assert "Value 1 $x$" in markdown 103 | 104 | 105 | @pytest.mark.filename("A17_FlightPlan.pdf") 106 | @pytest.mark.config({"page_range": [0]}) 107 | def test_llm_caption_processor_disabled(pdf_document): 108 | config = {"use_llm": True, "gemini_api_key": "test"} 109 | mock_cls = MagicMock() 110 | processor_lst = [LLMImageDescriptionProcessor(config)] 111 | processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) 112 | processor(pdf_document) 113 | 114 | contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure)) 115 | assert all(picture.description is None for picture in contained_pictures) 116 | 117 | @pytest.mark.filename("A17_FlightPlan.pdf") 118 | @pytest.mark.config({"page_range": [0]}) 119 | def test_llm_caption_processor(pdf_document): 120 | description = "This is an image description." 
121 | mock_cls = Mock() 122 | mock_cls.return_value = {"image_description": description} 123 | 124 | config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False} 125 | processor_lst = [LLMImageDescriptionProcessor(config)] 126 | processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) 127 | processor(pdf_document) 128 | 129 | contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure)) 130 | assert all(picture.description == description for picture in contained_pictures) 131 | 132 | # Ensure the rendering includes the description 133 | renderer = MarkdownRenderer({"extract_images": False}) 134 | md = renderer(pdf_document).markdown 135 | 136 | assert description in md 137 | 138 | 139 | @pytest.mark.filename("A17_FlightPlan.pdf") 140 | @pytest.mark.config({"page_range": [0]}) 141 | def test_llm_complex_region_processor(pdf_document): 142 | md = "This is some *markdown* for a complex region." 143 | mock_cls = Mock() 144 | mock_cls.return_value = {"corrected_markdown": md * 25} 145 | 146 | # Replace the block with a complex region 147 | old_block = pdf_document.pages[0].children[0] 148 | new_block = ComplexRegion( 149 | **old_block.dict(exclude=["id", "block_id", "block_type"]), 150 | ) 151 | pdf_document.pages[0].replace_block(old_block, new_block) 152 | 153 | # Test processor 154 | config = {"use_llm": True, "gemini_api_key": "test"} 155 | processor_lst = [LLMComplexRegionProcessor(config)] 156 | processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) 157 | processor(pdf_document) 158 | 159 | # Ensure the rendering includes the description 160 | renderer = MarkdownRenderer() 161 | rendered_md = renderer(pdf_document).markdown 162 | 163 | assert md in rendered_md 164 | 165 | @pytest.mark.filename("adversarial.pdf") 166 | @pytest.mark.config({"page_range": [0]}) 167 | def test_multi_llm_processors(pdf_document): 168 | description = "<math>This is an image description. 
And here is a lot of writing about it.</math>" * 10 169 | mock_cls = Mock() 170 | mock_cls.return_value = {"image_description": description, "corrected_equation": description} 171 | 172 | config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001} 173 | processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)] 174 | processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) 175 | processor(pdf_document) 176 | 177 | contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure)) 178 | assert all(picture.description == description for picture in contained_pictures) 179 | 180 | contained_equations = pdf_document.contained_blocks((BlockTypes.Equation,)) 181 | print([equation.html for equation in contained_equations]) 182 | assert all(equation.html == description for equation in contained_equations) ``` -------------------------------------------------------------------------------- /benchmarks/table/inference.py: -------------------------------------------------------------------------------- ```python 1 | from typing import List 2 | 3 | import numpy as np 4 | from bs4 import BeautifulSoup 5 | import pypdfium2 as pdfium 6 | from tqdm import tqdm 7 | import base64 8 | import tempfile 9 | 10 | from benchmarks.table.gemini import gemini_table_rec 11 | from marker.config.parser import ConfigParser 12 | from marker.converters.table import TableConverter 13 | from marker.models import create_model_dict 14 | from marker.processors.llm.llm_table import LLMTableProcessor 15 | from marker.processors.table import TableProcessor 16 | from marker.renderers.json import JSONBlockOutput 17 | from marker.schema.polygon import PolygonBox 18 | from marker.util import matrix_intersection_area 19 | 20 | 21 | def extract_tables(children: List[JSONBlockOutput]): 22 | tables = [] 23 | for child in children: 24 | if child.block_type == 'Table': 25 | tables.append(child) 26 | elif 
child.children: 27 | tables.extend(extract_tables(child.children)) 28 | return tables 29 | 30 | def fix_table_html(table_html: str) -> str: 31 | marker_table_soup = BeautifulSoup(table_html, 'html.parser') 32 | tbody = marker_table_soup.find('tbody') 33 | if tbody: 34 | tbody.unwrap() 35 | for th_tag in marker_table_soup.find_all('th'): 36 | th_tag.name = 'td' 37 | for br_tag in marker_table_soup.find_all('br'): 38 | br_tag.replace_with(marker_table_soup.new_string('')) 39 | 40 | marker_table_html = str(marker_table_soup) 41 | marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines 42 | return marker_table_html 43 | 44 | 45 | def inference_tables(dataset, use_llm: bool, table_rec_batch_size: int | None, max_rows: int, use_gemini: bool): 46 | models = create_model_dict() 47 | config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True}) 48 | total_unaligned = 0 49 | results = [] 50 | 51 | iterations = len(dataset) 52 | if max_rows is not None: 53 | iterations = min(max_rows, len(dataset)) 54 | 55 | for i in tqdm(range(iterations), desc='Converting Tables'): 56 | try: 57 | row = dataset[i] 58 | pdf_binary = base64.b64decode(row['pdf']) 59 | gt_tables = row['tables'] # Already sorted by reading order, which is what marker returns 60 | 61 | # Only use the basic table processors 62 | converter = TableConverter( 63 | config=config_parser.generate_config_dict(), 64 | artifact_dict=models, 65 | processor_list=[ 66 | "marker.processors.table.TableProcessor", 67 | "marker.processors.llm.llm_table.LLMTableProcessor", 68 | ], 69 | renderer=config_parser.get_renderer() 70 | ) 71 | 72 | with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file: 73 | temp_pdf_file.write(pdf_binary) 74 | temp_pdf_file.seek(0) 75 | marker_json = converter(temp_pdf_file.name).children 76 | 77 | doc = pdfium.PdfDocument(temp_pdf_file.name) 78 | page_image = 
doc[0].render(scale=96/72).to_pil() 79 | doc.close() 80 | 81 | if len(marker_json) == 0 or len(gt_tables) == 0: 82 | print(f'No tables detected, skipping...') 83 | total_unaligned += len(gt_tables) 84 | continue 85 | 86 | marker_tables = extract_tables(marker_json) 87 | marker_table_boxes = [table.bbox for table in marker_tables] 88 | page_bbox = marker_json[0].bbox 89 | 90 | if len(marker_tables) != len(gt_tables): 91 | print(f'Number of tables do not match, skipping...') 92 | total_unaligned += len(gt_tables) 93 | continue 94 | 95 | table_images = [ 96 | page_image.crop( 97 | PolygonBox.from_bbox(bbox) 98 | .rescale( 99 | (page_bbox[2], page_bbox[3]), (page_image.width, page_image.height) 100 | ).bbox 101 | ) 102 | for bbox 103 | in marker_table_boxes 104 | ] 105 | 106 | # Normalize the bboxes 107 | for bbox in marker_table_boxes: 108 | bbox[0] = bbox[0] / page_bbox[2] 109 | bbox[1] = bbox[1] / page_bbox[3] 110 | bbox[2] = bbox[2] / page_bbox[2] 111 | bbox[3] = bbox[3] / page_bbox[3] 112 | 113 | gt_boxes = [table['normalized_bbox'] for table in gt_tables] 114 | gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes] 115 | marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes] 116 | table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes) 117 | 118 | aligned_tables = [] 119 | used_tables = set() 120 | unaligned_tables = set() 121 | for table_idx, alignment in enumerate(table_alignments): 122 | try: 123 | max_area = np.max(alignment) 124 | aligned_idx = np.argmax(alignment) 125 | except ValueError: 126 | # No alignment found 127 | unaligned_tables.add(table_idx) 128 | continue 129 | 130 | if max_area <= .01: 131 | # No alignment found 132 | unaligned_tables.add(table_idx) 133 | continue 134 | 135 | if aligned_idx in used_tables: 136 | # Marker table already aligned with another gt table 137 | unaligned_tables.add(table_idx) 138 | continue 139 | 140 | # Gt table doesn't align well with any 
marker table 141 | gt_table_pct = gt_areas[table_idx] / max_area 142 | if not .85 < gt_table_pct < 1.15: 143 | unaligned_tables.add(table_idx) 144 | continue 145 | 146 | # Marker table doesn't align with gt table 147 | marker_table_pct = marker_areas[aligned_idx] / max_area 148 | if not .85 < marker_table_pct < 1.15: 149 | unaligned_tables.add(table_idx) 150 | continue 151 | 152 | gemini_html = "" 153 | if use_gemini: 154 | try: 155 | gemini_html = gemini_table_rec(table_images[aligned_idx]) 156 | except Exception as e: 157 | print(f'Gemini failed: {e}') 158 | 159 | aligned_tables.append( 160 | (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html) 161 | ) 162 | used_tables.add(aligned_idx) 163 | 164 | total_unaligned += len(unaligned_tables) 165 | 166 | for marker_table, gt_table, gemini_table in aligned_tables: 167 | gt_table_html = gt_table['html'] 168 | 169 | # marker wraps the table in <tbody> which fintabnet data doesn't 170 | # Fintabnet doesn't use th tags, need to be replaced for fair comparison 171 | marker_table_html = fix_table_html(marker_table.html) 172 | gemini_table_html = fix_table_html(gemini_table) 173 | 174 | results.append({ 175 | "marker_table": marker_table_html, 176 | "gt_table": gt_table_html, 177 | "gemini_table": gemini_table_html 178 | }) 179 | except pdfium.PdfiumError: 180 | print('Broken PDF, Skipping...') 181 | continue 182 | return results, total_unaligned ```