This is page 3 of 7. Use http://codebase.md/datalab-to/marker?lines=false&page={x} to view the full context. # Directory Structure ``` ├── .github │ ├── ISSUE_TEMPLATE │ │ ├── breaking-bug-report.md │ │ ├── feature_request.md │ │ └── output-bug-report.md │ └── workflows │ ├── benchmarks.yml │ ├── ci.yml │ ├── cla.yml │ ├── publish.yml │ └── scripts.yml ├── .gitignore ├── .pre-commit-config.yaml ├── benchmarks │ ├── __init__.py │ ├── overall │ │ ├── __init__.py │ │ ├── display │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ └── table.py │ │ ├── download │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── llamaparse.py │ │ │ ├── main.py │ │ │ ├── mathpix.py │ │ │ └── mistral.py │ │ ├── elo.py │ │ ├── methods │ │ │ ├── __init__.py │ │ │ ├── docling.py │ │ │ ├── gt.py │ │ │ ├── llamaparse.py │ │ │ ├── marker.py │ │ │ ├── mathpix.py │ │ │ ├── mistral.py │ │ │ ├── olmocr.py │ │ │ └── schema.py │ │ ├── overall.py │ │ ├── registry.py │ │ ├── schema.py │ │ └── scorers │ │ ├── __init__.py │ │ ├── clean.py │ │ ├── heuristic.py │ │ ├── llm.py │ │ └── schema.py │ ├── table │ │ ├── __init__.py │ │ ├── gemini.py │ │ ├── inference.py │ │ ├── scoring.py │ │ └── table.py │ ├── throughput │ │ ├── __init__.py │ │ └── main.py │ └── verify_scores.py ├── chunk_convert.py ├── CLA.md ├── convert_single.py ├── convert.py ├── data │ ├── .gitignore │ ├── examples │ │ ├── json │ │ │ ├── multicolcnn.json │ │ │ ├── switch_trans.json │ │ │ └── thinkpython.json │ │ └── markdown │ │ ├── multicolcnn │ │ │ ├── _page_1_Figure_0.jpeg │ │ │ ├── _page_2_Picture_0.jpeg │ │ │ ├── _page_6_Figure_0.jpeg │ │ │ ├── _page_7_Figure_0.jpeg │ │ │ ├── multicolcnn_meta.json │ │ │ └── multicolcnn.md │ │ ├── switch_transformers │ │ │ ├── _page_11_Figure_4.jpeg │ │ │ ├── _page_12_Figure_4.jpeg │ │ │ ├── _page_13_Figure_2.jpeg │ │ │ ├── _page_18_Figure_1.jpeg │ │ │ ├── _page_18_Figure_3.jpeg │ │ │ ├── _page_2_Figure_3.jpeg │ │ │ ├── _page_20_Figure_1.jpeg │ │ │ ├── _page_20_Figure_4.jpeg │ │ │ ├── _page_27_Figure_1.jpeg │ │ │ ├── _page_29_Figure_1.jpeg │ │ │ ├── _page_30_Figure_1.jpeg │ │ │ ├── _page_31_Figure_3.jpeg │ │ │ ├── _page_4_Figure_1.jpeg │ │ │ ├── _page_5_Figure_3.jpeg │ │ │ ├── switch_trans_meta.json │ │ │ └── switch_trans.md │ │ └── thinkpython │ │ ├── _page_109_Figure_1.jpeg │ │ ├── _page_115_Figure_1.jpeg │ │ ├── _page_116_Figure_3.jpeg │ │ ├── _page_127_Figure_1.jpeg │ │ ├── _page_128_Figure_1.jpeg │ │ ├── _page_167_Figure_1.jpeg │ │ ├── _page_169_Figure_1.jpeg │ │ ├── _page_173_Figure_1.jpeg │ │ ├── _page_190_Figure_1.jpeg │ │ ├── _page_195_Figure_1.jpeg │ │ ├── _page_205_Figure_1.jpeg │ │ ├── _page_23_Figure_1.jpeg │ │ ├── _page_23_Figure_3.jpeg │ │ ├── _page_230_Figure_1.jpeg │ │ ├── _page_233_Figure_1.jpeg │ │ ├── _page_233_Figure_3.jpeg │ │ ├── _page_234_Figure_1.jpeg │ │ ├── _page_235_Figure_1.jpeg │ │ ├── _page_236_Figure_1.jpeg │ │ ├── _page_236_Figure_3.jpeg │ │ ├── _page_237_Figure_1.jpeg │ │ ├── _page_238_Figure_1.jpeg │ │ ├── _page_46_Figure_1.jpeg │ │ ├── _page_60_Figure_1.jpeg │ │ ├── _page_60_Figure_3.jpeg │ │ ├── _page_67_Figure_1.jpeg │ │ ├── _page_71_Figure_1.jpeg │ │ ├── _page_78_Figure_1.jpeg │ │ ├── _page_85_Figure_1.jpeg │ │ ├── _page_94_Figure_1.jpeg │ │ ├── _page_99_Figure_17.jpeg │ │ ├── _page_99_Figure_178.jpeg │ │ ├── thinkpython_meta.json │ │ └── thinkpython.md │ ├── images │ │ ├── overall.png │ │ ├── per_doc.png │ │ └── table.png │ └── latex_to_md.sh ├── examples │ ├── marker_modal_deployment.py │ └── README.md ├── extraction_app.py ├── LICENSE ├── marker │ ├── builders │ │ ├── __init__.py │ │ ├── 
document.py │ │ ├── layout.py │ │ ├── line.py │ │ ├── ocr.py │ │ └── structure.py │ ├── config │ │ ├── __init__.py │ │ ├── crawler.py │ │ ├── parser.py │ │ └── printer.py │ ├── converters │ │ ├── __init__.py │ │ ├── extraction.py │ │ ├── ocr.py │ │ ├── pdf.py │ │ └── table.py │ ├── extractors │ │ ├── __init__.py │ │ ├── document.py │ │ └── page.py │ ├── logger.py │ ├── models.py │ ├── output.py │ ├── processors │ │ ├── __init__.py │ │ ├── blank_page.py │ │ ├── block_relabel.py │ │ ├── blockquote.py │ │ ├── code.py │ │ ├── debug.py │ │ ├── document_toc.py │ │ ├── equation.py │ │ ├── footnote.py │ │ ├── ignoretext.py │ │ ├── line_merge.py │ │ ├── line_numbers.py │ │ ├── list.py │ │ ├── llm │ │ │ ├── __init__.py │ │ │ ├── llm_complex.py │ │ │ ├── llm_equation.py │ │ │ ├── llm_form.py │ │ │ ├── llm_handwriting.py │ │ │ ├── llm_image_description.py │ │ │ ├── llm_mathblock.py │ │ │ ├── llm_meta.py │ │ │ ├── llm_page_correction.py │ │ │ ├── llm_sectionheader.py │ │ │ ├── llm_table_merge.py │ │ │ └── llm_table.py │ │ ├── order.py │ │ ├── page_header.py │ │ ├── reference.py │ │ ├── sectionheader.py │ │ ├── table.py │ │ ├── text.py │ │ └── util.py │ ├── providers │ │ ├── __init__.py │ │ ├── document.py │ │ ├── epub.py │ │ ├── html.py │ │ ├── image.py │ │ ├── pdf.py │ │ ├── powerpoint.py │ │ ├── registry.py │ │ ├── spreadsheet.py │ │ └── utils.py │ ├── renderers │ │ ├── __init__.py │ │ ├── chunk.py │ │ ├── extraction.py │ │ ├── html.py │ │ ├── json.py │ │ ├── markdown.py │ │ └── ocr_json.py │ ├── schema │ │ ├── __init__.py │ │ ├── blocks │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── basetable.py │ │ │ ├── caption.py │ │ │ ├── code.py │ │ │ ├── complexregion.py │ │ │ ├── equation.py │ │ │ ├── figure.py │ │ │ ├── footnote.py │ │ │ ├── form.py │ │ │ ├── handwriting.py │ │ │ ├── inlinemath.py │ │ │ ├── listitem.py │ │ │ ├── pagefooter.py │ │ │ ├── pageheader.py │ │ │ ├── picture.py │ │ │ ├── reference.py │ │ │ ├── sectionheader.py │ │ │ ├── table.py │ │ │ ├── tablecell.py │ │ │ ├── text.py │ │ │ └── toc.py │ │ ├── document.py │ │ ├── groups │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── figure.py │ │ │ ├── list.py │ │ │ ├── page.py │ │ │ ├── picture.py │ │ │ └── table.py │ │ ├── polygon.py │ │ ├── registry.py │ │ └── text │ │ ├── __init__.py │ │ ├── char.py │ │ ├── line.py │ │ └── span.py │ ├── scripts │ │ ├── __init__.py │ │ ├── chunk_convert.py │ │ ├── chunk_convert.sh │ │ ├── common.py │ │ ├── convert_single.py │ │ ├── convert.py │ │ ├── extraction_app.py │ │ ├── file_to_s3.py │ │ ├── run_streamlit_app.py │ │ ├── server.py │ │ └── streamlit_app.py │ ├── services │ │ ├── __init__.py │ │ ├── azure_openai.py │ │ ├── claude.py │ │ ├── gemini.py │ │ ├── ollama.py │ │ ├── openai.py │ │ └── vertex.py │ ├── settings.py │ ├── util.py │ └── utils │ ├── __init__.py │ ├── batch.py │ ├── gpu.py │ └── image.py ├── marker_app.py ├── marker_server.py ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── README.md ├── signatures │ └── version1 │ └── cla.json ├── static │ └── fonts │ └── .gitignore └── tests ├── builders │ ├── test_blank_page.py │ ├── test_document_builder.py │ ├── test_garbled_pdf.py │ ├── test_layout_replace.py │ ├── test_ocr_builder.py │ ├── test_ocr_pipeline.py │ ├── test_overriding.py │ ├── test_pdf_links.py │ ├── test_rotated_bboxes.py │ ├── test_strip_existing_ocr.py │ └── test_structure.py ├── config │ └── test_config.py ├── conftest.py ├── converters │ ├── test_extraction_converter.py │ ├── test_ocr_converter.py │ ├── test_pdf_converter.py │ └── test_table_converter.py ├── processors │ 
├── test_document_toc_processor.py │ ├── test_equation_processor.py │ ├── test_footnote_processor.py │ ├── test_ignoretext.py │ ├── test_llm_processors.py │ ├── test_table_merge.py │ └── test_table_processor.py ├── providers │ ├── test_document_providers.py │ ├── test_image_provider.py │ └── test_pdf_provider.py ├── renderers │ ├── test_chunk_renderer.py │ ├── test_extract_images.py │ ├── test_html_renderer.py │ ├── test_json_renderer.py │ └── test_markdown_renderer.py ├── schema │ └── groups │ └── test_list_grouping.py ├── services │ └── test_service_init.py └── utils.py ``` # Files -------------------------------------------------------------------------------- /marker/config/parser.py: -------------------------------------------------------------------------------- ```python import json import os from typing import Any, Dict import click from marker.converters.pdf import PdfConverter from marker.logger import get_logger from marker.renderers.chunk import ChunkRenderer from marker.renderers.html import HTMLRenderer from marker.renderers.json import JSONRenderer from marker.renderers.markdown import MarkdownRenderer from marker.settings import settings from marker.util import classes_to_strings, parse_range_str, strings_to_classes logger = get_logger() class ConfigParser: def __init__(self, cli_options: dict): self.cli_options = cli_options @staticmethod def common_options(fn): fn = click.option( "--output_dir", type=click.Path(exists=False), required=False, default=settings.OUTPUT_DIR, help="Directory to save output.", )(fn) fn = click.option("--debug", "-d", is_flag=True, help="Enable debug mode.")(fn) fn = click.option( "--output_format", type=click.Choice(["markdown", "json", "html", "chunks"]), default="markdown", help="Format to output results in.", )(fn) fn = click.option( "--processors", type=str, default=None, help="Comma separated list of processors to use. Must use full module path.", )(fn) fn = click.option( "--config_json", type=str, default=None, help="Path to JSON file with additional configuration.", )(fn) fn = click.option( "--disable_multiprocessing", is_flag=True, default=False, help="Disable multiprocessing.", )(fn) fn = click.option( "--disable_image_extraction", is_flag=True, default=False, help="Disable image extraction.", )(fn) # these are options that need a list transformation, i.e. splitting/parsing a string fn = click.option( "--page_range", type=str, default=None, help="Page range to convert, specify comma separated page numbers or ranges. Example: 0,5-10,20", )(fn) # converter and LLM service selection options fn = click.option( "--converter_cls", type=str, default=None, help="Converter class to use. 
Defaults to PDF converter.", )(fn) fn = click.option( "--llm_service", type=str, default=None, help="LLM service to use - should be full import path, like marker.services.gemini.GoogleGeminiService", )(fn) return fn def generate_config_dict(self) -> Dict[str, Any]: config = {} output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR) for k, v in self.cli_options.items(): if not v: continue match k: case "debug": config["debug_pdf_images"] = True config["debug_layout_images"] = True config["debug_json"] = True config["debug_data_folder"] = output_dir case "page_range": config["page_range"] = parse_range_str(v) case "config_json": with open(v, "r", encoding="utf-8") as f: config.update(json.load(f)) case "disable_multiprocessing": config["pdftext_workers"] = 1 case "disable_image_extraction": config["extract_images"] = False case _: config[k] = v # Backward compatibility for google_api_key if settings.GOOGLE_API_KEY: config["gemini_api_key"] = settings.GOOGLE_API_KEY return config def get_llm_service(self): # Only return an LLM service when use_llm is enabled if not self.cli_options.get("use_llm", False): return None service_cls = self.cli_options.get("llm_service", None) if service_cls is None: service_cls = "marker.services.gemini.GoogleGeminiService" return service_cls def get_renderer(self): match self.cli_options["output_format"]: case "json": r = JSONRenderer case "markdown": r = MarkdownRenderer case "html": r = HTMLRenderer case "chunks": r = ChunkRenderer case _: raise ValueError("Invalid output format") return classes_to_strings([r])[0] def get_processors(self): processors = self.cli_options.get("processors", None) if processors is not None: processors = processors.split(",") for p in processors: try: strings_to_classes([p]) except Exception as e: logger.error(f"Error loading processor: {p} with error: {e}") raise return processors def get_converter_cls(self): converter_cls = self.cli_options.get("converter_cls", None) if converter_cls is not None: try: return strings_to_classes([converter_cls])[0] except Exception as e: logger.error( f"Error loading converter: {converter_cls} with error: {e}" ) raise return PdfConverter def get_output_folder(self, filepath: str): output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR) fname_base = os.path.splitext(os.path.basename(filepath))[0] output_dir = os.path.join(output_dir, fname_base) os.makedirs(output_dir, exist_ok=True) return output_dir def get_base_filename(self, filepath: str): basename = os.path.basename(filepath) return os.path.splitext(basename)[0] ``` -------------------------------------------------------------------------------- /marker/builders/layout.py: -------------------------------------------------------------------------------- ```python from typing import Annotated, List from surya.layout import LayoutPredictor from surya.layout.schema import LayoutResult, LayoutBox from marker.builders import BaseBuilder from marker.providers.pdf import PdfProvider from marker.schema import BlockTypes from marker.schema.document import Document from marker.schema.groups.page import PageGroup from marker.schema.polygon import PolygonBox from marker.schema.registry import get_block_class from marker.settings import settings class LayoutBuilder(BaseBuilder): """ A builder for performing layout detection on PDF pages and merging the results into the document. 
""" layout_batch_size: Annotated[ int, "The batch size to use for the layout model.", "Default is None, which will use the default batch size for the model.", ] = None force_layout_block: Annotated[ str, "Skip layout and force every page to be treated as a specific block type.", ] = None disable_tqdm: Annotated[ bool, "Disable tqdm progress bars.", ] = False expand_block_types: Annotated[ List[BlockTypes], "Block types whose bounds should be expanded to accomodate missing regions", ] = [ BlockTypes.Picture, BlockTypes.Figure, BlockTypes.ComplexRegion, ] # Does not include groups since they are only injected later max_expand_frac: Annotated[ float, "The maximum fraction to expand the layout box bounds by" ] = 0.05 def __init__(self, layout_model: LayoutPredictor, config=None): self.layout_model = layout_model super().__init__(config) def __call__(self, document: Document, provider: PdfProvider): if self.force_layout_block is not None: # Assign the full content of every page to a single layout type layout_results = self.forced_layout(document.pages) else: layout_results = self.surya_layout(document.pages) self.add_blocks_to_pages(document.pages, layout_results) self.expand_layout_blocks(document) def get_batch_size(self): if self.layout_batch_size is not None: return self.layout_batch_size elif settings.TORCH_DEVICE_MODEL == "cuda": return 12 return 6 def forced_layout(self, pages: List[PageGroup]) -> List[LayoutResult]: layout_results = [] for page in pages: layout_results.append( LayoutResult( image_bbox=page.polygon.bbox, bboxes=[ LayoutBox( label=self.force_layout_block, position=0, top_k={self.force_layout_block: 1}, polygon=page.polygon.polygon, ), ], sliced=False, ) ) return layout_results def surya_layout(self, pages: List[PageGroup]) -> List[LayoutResult]: self.layout_model.disable_tqdm = self.disable_tqdm layout_results = self.layout_model( [p.get_image(highres=False) for p in pages], batch_size=int(self.get_batch_size()), ) return layout_results def expand_layout_blocks(self, document: Document): for page in document.pages: # Collect all blocks on this page as PolygonBox for easy access page_blocks = [document.get_block(bid) for bid in page.structure] page_size = page.polygon.size for block_id in page.structure: block = document.get_block(block_id) if block.block_type in self.expand_block_types: other_blocks = [b for b in page_blocks if b != block] if not other_blocks: block.polygon = block.polygon.expand( self.max_expand_frac, self.max_expand_frac ).fit_to_bounds((0, 0, *page_size)) continue min_gap = min( block.polygon.minimum_gap(other.polygon) for other in other_blocks ) if min_gap <= 0: continue x_expand_frac = ( min_gap / block.polygon.width if block.polygon.width > 0 else 0 ) y_expand_frac = ( min_gap / block.polygon.height if block.polygon.height > 0 else 0 ) block.polygon = block.polygon.expand( min(self.max_expand_frac, x_expand_frac), min(self.max_expand_frac, y_expand_frac), ).fit_to_bounds((0, 0, *page_size)) def add_blocks_to_pages( self, pages: List[PageGroup], layout_results: List[LayoutResult] ): for page, layout_result in zip(pages, layout_results): layout_page_size = PolygonBox.from_bbox(layout_result.image_bbox).size provider_page_size = page.polygon.size page.layout_sliced = ( layout_result.sliced ) # This indicates if the page was sliced by the layout model for bbox in sorted(layout_result.bboxes, key=lambda x: x.position): block_cls = get_block_class(BlockTypes[bbox.label]) layout_block = page.add_block( block_cls, PolygonBox(polygon=bbox.polygon) ) 
layout_block.polygon = layout_block.polygon.rescale( layout_page_size, provider_page_size ).fit_to_bounds((0, 0, *provider_page_size)) layout_block.top_k = { BlockTypes[label]: prob for (label, prob) in bbox.top_k.items() if label in BlockTypes.__members__ } page.add_structure(layout_block) # Ensure page has non-empty structure if page.structure is None: page.structure = [] # Ensure page has non-empty children if page.children is None: page.children = [] ``` -------------------------------------------------------------------------------- /marker/scripts/convert.py: -------------------------------------------------------------------------------- ```python import atexit import os import time import psutil import torch from marker.utils.batch import get_batch_sizes_worker_counts # Ensure threads don't contend os.environ["MKL_DYNAMIC"] = "FALSE" os.environ["OMP_DYNAMIC"] = "FALSE" os.environ["OMP_NUM_THREADS"] = "2" # Avoid OpenMP issues with multiprocessing os.environ["OPENBLAS_NUM_THREADS"] = "2" os.environ["MKL_NUM_THREADS"] = "2" os.environ["GRPC_VERBOSITY"] = "ERROR" os.environ["GLOG_minloglevel"] = "2" os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = ( "1" # Transformers uses .isin for a simple op, which is not supported on MPS ) os.environ["IN_STREAMLIT"] = "true" # Avoid multiprocessing inside surya import math import traceback import click import torch.multiprocessing as mp from tqdm import tqdm import gc from marker.config.parser import ConfigParser from marker.config.printer import CustomClickPrinter from marker.logger import configure_logging, get_logger from marker.models import create_model_dict from marker.output import output_exists, save_output from marker.utils.gpu import GPUManager configure_logging() logger = get_logger() def worker_init(): model_dict = create_model_dict() global model_refs model_refs = model_dict # Ensure we clean up the model references on exit atexit.register(worker_exit) def worker_exit(): global model_refs try: del model_refs except Exception: pass def process_single_pdf(args): page_count = 0 fpath, cli_options = args torch.set_num_threads(cli_options["total_torch_threads"]) del cli_options["total_torch_threads"] config_parser = ConfigParser(cli_options) out_folder = config_parser.get_output_folder(fpath) base_name = config_parser.get_base_filename(fpath) if cli_options.get("skip_existing") and output_exists(out_folder, base_name): return page_count converter_cls = config_parser.get_converter_cls() config_dict = config_parser.generate_config_dict() config_dict["disable_tqdm"] = True try: if cli_options.get("debug_print"): logger.debug(f"Converting {fpath}") converter = converter_cls( config=config_dict, artifact_dict=model_refs, processor_list=config_parser.get_processors(), renderer=config_parser.get_renderer(), llm_service=config_parser.get_llm_service(), ) rendered = converter(fpath) out_folder = config_parser.get_output_folder(fpath) save_output(rendered, out_folder, base_name) page_count = converter.page_count if cli_options.get("debug_print"): logger.debug(f"Converted {fpath}") del rendered del converter except Exception as e: logger.error(f"Error converting {fpath}: {e}") traceback.print_exc() finally: gc.collect() return page_count @click.command(cls=CustomClickPrinter) @click.argument("in_folder", type=str) @click.option("--chunk_idx", type=int, default=0, help="Chunk index to convert") @click.option( "--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel", ) @click.option( "--max_files", type=int, default=None, 
help="Maximum number of pdfs to convert" ) @click.option( "--skip_existing", is_flag=True, default=False, help="Skip existing converted files.", ) @click.option( "--debug_print", is_flag=True, default=False, help="Print debug information." ) @click.option( "--max_tasks_per_worker", type=int, default=10, help="Maximum number of tasks per worker process before recycling.", ) @click.option( "--workers", type=int, default=None, help="Number of worker processes to use. Set automatically by default, but can be overridden.", ) @ConfigParser.common_options def convert_cli(in_folder: str, **kwargs): total_pages = 0 in_folder = os.path.abspath(in_folder) files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)] files = [f for f in files if os.path.isfile(f)] # Handle chunks if we're processing in parallel # Ensure we get all files into a chunk chunk_size = math.ceil(len(files) / kwargs["num_chunks"]) start_idx = kwargs["chunk_idx"] * chunk_size end_idx = start_idx + chunk_size files_to_convert = files[start_idx:end_idx] # Limit files converted if needed if kwargs["max_files"]: files_to_convert = files_to_convert[: kwargs["max_files"]] # Disable nested multiprocessing kwargs["disable_multiprocessing"] = True try: mp.set_start_method("spawn") # Required for CUDA, forkserver doesn't work except RuntimeError: raise RuntimeError( "Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again." ) chunk_idx = kwargs["chunk_idx"] # Use GPU context manager for automatic setup/cleanup with GPUManager(chunk_idx) as gpu_manager: batch_sizes, workers = get_batch_sizes_worker_counts(gpu_manager, 7) # Override workers if specified if kwargs["workers"] is not None: workers = kwargs["workers"] # Set proper batch sizes and thread counts total_processes = max(1, min(len(files_to_convert), workers)) kwargs["total_torch_threads"] = max( 2, psutil.cpu_count(logical=False) // total_processes ) kwargs.update(batch_sizes) logger.info( f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}" ) task_args = [(f, kwargs) for f in files_to_convert] start_time = time.time() with mp.Pool( processes=total_processes, initializer=worker_init, maxtasksperchild=kwargs["max_tasks_per_worker"], ) as pool: pbar = tqdm(total=len(task_args), desc="Processing PDFs", unit="pdf") for page_count in pool.imap_unordered(process_single_pdf, task_args): pbar.update(1) total_pages += page_count pbar.close() total_time = time.time() - start_time print( f"Inferenced {total_pages} pages in {total_time:.2f} seconds, for a throughput of {total_pages / total_time:.2f} pages/sec for chunk {chunk_idx + 1}/{kwargs['num_chunks']}" ) ``` -------------------------------------------------------------------------------- /marker/processors/llm/__init__.py: -------------------------------------------------------------------------------- ```python import json import traceback from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Annotated, TypedDict, List, Sequence from pydantic import BaseModel from tqdm import tqdm from PIL import Image from marker.output import json_to_html from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.blocks import Block, BlockId from marker.schema.document import Document from marker.schema.groups import PageGroup from marker.services import BaseService from marker.util import assign_config from 
marker.logger import get_logger logger = get_logger() class PromptData(TypedDict): prompt: str image: Image.Image block: Block schema: BaseModel page: PageGroup additional_data: dict | None class BlockData(TypedDict): page: PageGroup block: Block class BaseLLMProcessor(BaseProcessor): """ A processor for using LLMs to convert blocks. """ max_concurrency: Annotated[ int, "The maximum number of concurrent requests to make to the Gemini model.", ] = 3 image_expansion_ratio: Annotated[ float, "The ratio to expand the image by when cropping.", ] = 0.01 use_llm: Annotated[ bool, "Whether to use the LLM model.", ] = False disable_tqdm: Annotated[ bool, "Whether to disable the tqdm progress bar.", ] = False block_types = None def __init__(self, llm_service: BaseService, config=None): super().__init__(config) self.llm_service = None if not self.use_llm: return self.llm_service = llm_service def extract_image( self, document: Document, image_block: Block, remove_blocks: Sequence[BlockTypes] | None = None, ) -> Image.Image: return image_block.get_image( document, highres=True, expansion=(self.image_expansion_ratio, self.image_expansion_ratio), remove_blocks=remove_blocks, ) def normalize_block_json(self, block: Block, document: Document, page: PageGroup): """ Get the normalized JSON representation of a block for the LLM. """ page_width = page.polygon.width page_height = page.polygon.height block_bbox = block.polygon.bbox # Normalize bbox to 0-1000 range normalized_bbox = [ (block_bbox[0] / page_width) * 1000, (block_bbox[1] / page_height) * 1000, (block_bbox[2] / page_width) * 1000, (block_bbox[3] / page_height) * 1000, ] block_json = { "id": str(block.id), "block_type": str(block.id.block_type), "bbox": normalized_bbox, "html": json_to_html(block.render(document)), } return block_json def load_blocks(self, response: dict): return [json.loads(block) for block in response["blocks"]] def handle_rewrites(self, blocks: list, document: Document): for block_data in blocks: try: block_id = block_data["id"].strip().lstrip("/") _, page_id, block_type, block_id = block_id.split("/") block_id = BlockId( page_id=page_id, block_id=block_id, block_type=getattr(BlockTypes, block_type), ) block = document.get_block(block_id) if not block: logger.debug(f"Block {block_id} not found in document") continue if hasattr(block, "html"): block.html = block_data["html"] except Exception as e: logger.debug(f"Error parsing block ID {block_data['id']}: {e}") continue class BaseLLMComplexBlockProcessor(BaseLLMProcessor): """ A processor for using LLMs to convert blocks with more complex logic. 
""" def __call__(self, document: Document): if not self.use_llm or self.llm_service is None: return try: self.rewrite_blocks(document) except Exception as e: logger.warning(f"Error rewriting blocks in {self.__class__.__name__}: {e}") def process_rewriting(self, document: Document, page: PageGroup, block: Block): raise NotImplementedError() def rewrite_blocks(self, document: Document): # Don't show progress if there are no blocks to process total_blocks = sum( len(page.contained_blocks(document, self.block_types)) for page in document.pages ) if total_blocks == 0: return pbar = tqdm( total=total_blocks, desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm ) with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: for future in as_completed( [ executor.submit(self.process_rewriting, document, page, block) for page in document.pages for block in page.contained_blocks(document, self.block_types) ] ): future.result() # Raise exceptions if any occurred pbar.update(1) pbar.close() class BaseLLMSimpleBlockProcessor(BaseLLMProcessor): """ A processor for using LLMs to convert single blocks. """ # Override init since we don't need an llmservice here def __init__(self, config=None): assign_config(self, config) def __call__(self, result: dict, prompt_data: PromptData, document: Document): try: self.rewrite_block(result, prompt_data, document) except Exception as e: logger.warning(f"Error rewriting block in {self.__class__.__name__}: {e}") traceback.print_exc() def inference_blocks(self, document: Document) -> List[BlockData]: blocks = [] for page in document.pages: for block in page.contained_blocks(document, self.block_types): blocks.append({"page": page, "block": block}) return blocks def block_prompts(self, document: Document) -> List[PromptData]: raise NotImplementedError() def rewrite_block( self, response: dict, prompt_data: PromptData, document: Document ): raise NotImplementedError() ``` -------------------------------------------------------------------------------- /benchmarks/overall/scorers/llm.py: -------------------------------------------------------------------------------- ```python import json import os import tempfile import time from typing import List from PIL import Image from google.genai.errors import APIError from google import genai import pypdfium2 as pdfium from benchmarks.overall.scorers import BaseScorer, BlockScores from marker.settings import settings rating_prompt = """ You're a document analysis expert who is comparing some markdown to an image to make sure the markdown is correct. You're rating how effectively the provided markdown represents the full text and formatting in the image provided. You're given an image, along with the extracted markdown: - Some parts of the page may have been recognized as images and linked from the markdown, like ``. - Tables will be formatted as Github flavored markdown. - Block equations will be in LaTeX. - The image and markdown may be in any language. - The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text. The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided. **Instructions** Follow this process to evaluate the markdown: 1. Carefully examine the image. 2. Carefully examine the markdown input provided. 3. Compare the image to the markdown representation. 
Does the markdown representation properly represent the important text and formatting in the image? 4. Assign component scores, as described below. These are the primary scores: - Overall - the overall quality of the markdown as compared to the image. - Text quality - the quality of the text extraction from the image. - Formatting quality - the quality of the formatting applied to the markdown, as compared to the image. Depending on which elements are present in the markdown, you will assign element-specific scores. - Tables - how effectively the tables have been extracted and formatted. - Forms - how effectively the forms have been extracted and formatted. - Equations - how effectively block equations have been converted to LaTeX. - Section headers - if all of the section headers have been detected, and the right levels set. - Lists - if the lists have been properly extracted and formatted. - Images - if images are identified and placed correctly. Notes on scoring: - To get a 5/5, all of the important text from the image must appear in the markdown, and the formatting should be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text. - A 3/5 may have small missing text elements from the markdown and/or moderate formatting issues. - A 1/5 will have major missing text segments from the markdown or completely unreadable formatting. - Use 0/5 if a field isn't applicable, like if the image doesn't contain a table. If text that is important to the meaning of the document is missing, do not score higher than 3/5. Output json, like in the example below. **Example** Input ```markdown # Section 1 This is some *markdown* extracted from a document. Here is a block equation: $$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$ ``` Output ```json { "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.", "markdown_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.", "comparison": "The text and formatting match the image. There are no formatting or text extraction issues. 
The equations and section headers are correct.", "overall": 5, "text": 5, "formatting": 5, "section_headers": 5, "tables": 0, "forms": 0, "equations": 5, "lists": 0, "images": 0 } ``` **Input** ```markdown {{markdown}} ``` **Output** """ comparison_keys = ["comparison"] description_keys = ["image_description", "markdown_description"] text_keys = comparison_keys + description_keys score_keys = ["overall", "text", "formatting", "section_headers", "tables", "forms", "equations", "lists", "images"] class LLMScorer(BaseScorer): def __call__(self, sample, gt_markdown: List[str], markdown: str) -> BlockScores: pdf_bytes = sample["pdf"] with tempfile.NamedTemporaryFile(suffix=".pdf") as f: f.write(pdf_bytes) f.flush() f.seek(0) doc = pdfium.PdfDocument(f.name) img = doc[0].render(scale=96/72).to_pil() doc.close() return self.llm_rater(img, markdown) def llm_rater(self, img: Image.Image, markdown: str) -> BlockScores: if not markdown: null_scores = {k: 1 for k in score_keys} text_scores = {k: "" for k in text_keys} null_scores.update(text_scores) return { "score": 1, "specific_scores": null_scores } req_keys = text_keys + score_keys properties = {} for key in req_keys: content_type = "INTEGER" if key in score_keys else "STRING" properties[key] = {"type": content_type} response_schema = { "required": req_keys, "properties": properties, "type": "OBJECT" } prompt = rating_prompt.replace("{{markdown}}", markdown) response = self.llm_response_wrapper([img, prompt], response_schema) assert all([k in response for k in req_keys]), f"Missing keys in response: {response}" return { "score": response["overall"], "specific_scores": response, } def llm_response_wrapper(self, prompt, response_schema, depth=0): client = genai.Client( http_options={"timeout": 60000}, vertexai=True, project=os.getenv("VERTEX_PROJECT_ID"), location=os.getenv("VERTEX_LOCATION"), ) try: responses = client.models.generate_content( model="gemini-2.0-flash-001", contents=prompt, config={ "temperature": 0, "response_schema": response_schema, "response_mime_type": "application/json", }, ) output = responses.candidates[0].content.parts[0].text return json.loads(output) except APIError as e: if depth > 2: raise e print("Hit Gemini rate limit, waiting 120 seconds") time.sleep(120) return self.llm_response_wrapper(prompt, response_schema, depth + 1) ``` -------------------------------------------------------------------------------- /tests/processors/test_llm_processors.py: -------------------------------------------------------------------------------- ```python from unittest.mock import MagicMock, Mock import pytest from marker.processors.llm.llm_complex import LLMComplexRegionProcessor from marker.processors.llm.llm_equation import LLMEquationProcessor from marker.processors.llm.llm_form import LLMFormProcessor from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor from marker.processors.llm.llm_table import LLMTableProcessor from marker.processors.table import TableProcessor from marker.renderers.markdown import MarkdownRenderer from marker.schema import BlockTypes from marker.schema.blocks import ComplexRegion @pytest.mark.filename("form_1040.pdf") @pytest.mark.config({"page_range": [0]}) def test_llm_form_processor_no_config(pdf_document, llm_service): processor_lst = [LLMFormProcessor()] processor = LLMSimpleBlockMetaProcessor(processor_lst, llm_service) processor(pdf_document) forms = pdf_document.contained_blocks((BlockTypes.Form,)) 
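# use_llm was never enabled in the config, so the meta processor should skip this block and leave the form HTML unset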
assert forms[0].html is None @pytest.mark.filename("form_1040.pdf") @pytest.mark.config({"page_range": [0]}) def test_llm_form_processor_no_cells(pdf_document, llm_service): config = {"use_llm": True, "gemini_api_key": "test"} processor_lst = [LLMFormProcessor(config)] processor = LLMSimpleBlockMetaProcessor(processor_lst, llm_service, config) processor(pdf_document) forms = pdf_document.contained_blocks((BlockTypes.Form,)) assert forms[0].html is None @pytest.mark.filename("form_1040.pdf") @pytest.mark.config({"page_range": [0]}) def test_llm_form_processor(pdf_document, table_rec_model, recognition_model, detection_model): corrected_html = "<em>This is corrected markdown.</em>\n" * 100 corrected_html = "<p>" + corrected_html.strip() + "</p>\n" mock_cls = Mock() mock_cls.return_value = {"corrected_html": corrected_html} cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model) cell_processor(pdf_document) config = {"use_llm": True, "gemini_api_key": "test"} processor_lst = [LLMFormProcessor(config)] processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) processor(pdf_document) forms = pdf_document.contained_blocks((BlockTypes.Form,)) assert forms[0].html == corrected_html.strip() @pytest.mark.filename("table_ex2.pdf") @pytest.mark.config({"page_range": [0]}) def test_llm_table_processor(pdf_document, table_rec_model, recognition_model, detection_model): corrected_html = """ <table> <tr> <td>Column 1</td> <td>Column 2</td> <td>Column 3</td> <td>Column 4</td> </tr> <tr> <td>Value 1 <math>x</math></td> <td>Value 2</td> <td>Value 3</td> <td>Value 4</td> </tr> <tr> <td>Value 5</td> <td>Value 6</td> <td>Value 7</td> <td>Value 8</td> </tr> </table> """.strip() mock_cls = Mock() mock_cls.return_value = {"corrected_html": corrected_html} cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model) cell_processor(pdf_document) processor = LLMTableProcessor(mock_cls, {"use_llm": True, "gemini_api_key": "test"}) processor(pdf_document) tables = pdf_document.contained_blocks((BlockTypes.Table,)) table_cells = tables[0].contained_blocks(pdf_document, (BlockTypes.TableCell,)) assert table_cells[0].text == "Column 1" markdown = MarkdownRenderer()(pdf_document).markdown assert "Value 1 $x$" in markdown @pytest.mark.filename("A17_FlightPlan.pdf") @pytest.mark.config({"page_range": [0]}) def test_llm_caption_processor_disabled(pdf_document): config = {"use_llm": True, "gemini_api_key": "test"} mock_cls = MagicMock() processor_lst = [LLMImageDescriptionProcessor(config)] processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) processor(pdf_document) contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure)) assert all(picture.description is None for picture in contained_pictures) @pytest.mark.filename("A17_FlightPlan.pdf") @pytest.mark.config({"page_range": [0]}) def test_llm_caption_processor(pdf_document): description = "This is an image description." 
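# Stub out the LLM service so every picture/figure block gets this canned description back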
mock_cls = Mock() mock_cls.return_value = {"image_description": description} config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False} processor_lst = [LLMImageDescriptionProcessor(config)] processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) processor(pdf_document) contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure)) assert all(picture.description == description for picture in contained_pictures) # Ensure the rendering includes the description renderer = MarkdownRenderer({"extract_images": False}) md = renderer(pdf_document).markdown assert description in md @pytest.mark.filename("A17_FlightPlan.pdf") @pytest.mark.config({"page_range": [0]}) def test_llm_complex_region_processor(pdf_document): md = "This is some *markdown* for a complex region." mock_cls = Mock() mock_cls.return_value = {"corrected_markdown": md * 25} # Replace the block with a complex region old_block = pdf_document.pages[0].children[0] new_block = ComplexRegion( **old_block.dict(exclude=["id", "block_id", "block_type"]), ) pdf_document.pages[0].replace_block(old_block, new_block) # Test processor config = {"use_llm": True, "gemini_api_key": "test"} processor_lst = [LLMComplexRegionProcessor(config)] processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) processor(pdf_document) # Ensure the rendering includes the description renderer = MarkdownRenderer() rendered_md = renderer(pdf_document).markdown assert md in rendered_md @pytest.mark.filename("adversarial.pdf") @pytest.mark.config({"page_range": [0]}) def test_multi_llm_processors(pdf_document): description = "<math>This is an image description. And here is a lot of writing about it.</math>" * 10 mock_cls = Mock() mock_cls.return_value = {"image_description": description, "corrected_equation": description} config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001} processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)] processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) processor(pdf_document) contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure)) assert all(picture.description == description for picture in contained_pictures) contained_equations = pdf_document.contained_blocks((BlockTypes.Equation,)) print([equation.html for equation in contained_equations]) assert all(equation.html == description for equation in contained_equations) ``` -------------------------------------------------------------------------------- /benchmarks/table/inference.py: -------------------------------------------------------------------------------- ```python from typing import List import numpy as np from bs4 import BeautifulSoup import pypdfium2 as pdfium from tqdm import tqdm import base64 import tempfile from benchmarks.table.gemini import gemini_table_rec from marker.config.parser import ConfigParser from marker.converters.table import TableConverter from marker.models import create_model_dict from marker.processors.llm.llm_table import LLMTableProcessor from marker.processors.table import TableProcessor from marker.renderers.json import JSONBlockOutput from marker.schema.polygon import PolygonBox from marker.util import matrix_intersection_area def extract_tables(children: List[JSONBlockOutput]): tables = [] for child in children: if child.block_type == 'Table': tables.append(child) elif child.children: tables.extend(extract_tables(child.children)) return 
tables def fix_table_html(table_html: str) -> str: marker_table_soup = BeautifulSoup(table_html, 'html.parser') tbody = marker_table_soup.find('tbody') if tbody: tbody.unwrap() for th_tag in marker_table_soup.find_all('th'): th_tag.name = 'td' for br_tag in marker_table_soup.find_all('br'): br_tag.replace_with(marker_table_soup.new_string('')) marker_table_html = str(marker_table_soup) marker_table_html = marker_table_html.replace("\n", " ") # Fintabnet uses spaces instead of newlines return marker_table_html def inference_tables(dataset, use_llm: bool, table_rec_batch_size: int | None, max_rows: int, use_gemini: bool): models = create_model_dict() config_parser = ConfigParser({'output_format': 'json', "use_llm": use_llm, "table_rec_batch_size": table_rec_batch_size, "disable_tqdm": True}) total_unaligned = 0 results = [] iterations = len(dataset) if max_rows is not None: iterations = min(max_rows, len(dataset)) for i in tqdm(range(iterations), desc='Converting Tables'): try: row = dataset[i] pdf_binary = base64.b64decode(row['pdf']) gt_tables = row['tables'] # Already sorted by reading order, which is what marker returns # Only use the basic table processors converter = TableConverter( config=config_parser.generate_config_dict(), artifact_dict=models, processor_list=[ "marker.processors.table.TableProcessor", "marker.processors.llm.llm_table.LLMTableProcessor", ], renderer=config_parser.get_renderer() ) with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as temp_pdf_file: temp_pdf_file.write(pdf_binary) temp_pdf_file.seek(0) marker_json = converter(temp_pdf_file.name).children doc = pdfium.PdfDocument(temp_pdf_file.name) page_image = doc[0].render(scale=96/72).to_pil() doc.close() if len(marker_json) == 0 or len(gt_tables) == 0: print('No tables detected, skipping...') total_unaligned += len(gt_tables) continue marker_tables = extract_tables(marker_json) marker_table_boxes = [table.bbox for table in marker_tables] page_bbox = marker_json[0].bbox if len(marker_tables) != len(gt_tables): print('Table counts do not match, skipping...') total_unaligned += len(gt_tables) continue table_images = [ page_image.crop( PolygonBox.from_bbox(bbox) .rescale( (page_bbox[2], page_bbox[3]), (page_image.width, page_image.height) ).bbox ) for bbox in marker_table_boxes ] # Normalize the bboxes for bbox in marker_table_boxes: bbox[0] = bbox[0] / page_bbox[2] bbox[1] = bbox[1] / page_bbox[3] bbox[2] = bbox[2] / page_bbox[2] bbox[3] = bbox[3] / page_bbox[3] gt_boxes = [table['normalized_bbox'] for table in gt_tables] gt_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in gt_boxes] marker_areas = [(bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) for bbox in marker_table_boxes] table_alignments = matrix_intersection_area(gt_boxes, marker_table_boxes) aligned_tables = [] used_tables = set() unaligned_tables = set() for table_idx, alignment in enumerate(table_alignments): try: max_area = np.max(alignment) aligned_idx = np.argmax(alignment) except ValueError: # No alignment found unaligned_tables.add(table_idx) continue if max_area <= .01: # No alignment found unaligned_tables.add(table_idx) continue if aligned_idx in used_tables: # Marker table already aligned with another gt table unaligned_tables.add(table_idx) continue # Gt table doesn't align well with any marker table gt_table_pct = gt_areas[table_idx] / max_area if not .85 < gt_table_pct < 1.15: unaligned_tables.add(table_idx) continue # Marker table doesn't align with gt table marker_table_pct = marker_areas[aligned_idx] / max_area if not 
.85 < marker_table_pct < 1.15: unaligned_tables.add(table_idx) continue gemini_html = "" if use_gemini: try: gemini_html = gemini_table_rec(table_images[aligned_idx]) except Exception as e: print(f'Gemini failed: {e}') aligned_tables.append( (marker_tables[aligned_idx], gt_tables[table_idx], gemini_html) ) used_tables.add(aligned_idx) total_unaligned += len(unaligned_tables) for marker_table, gt_table, gemini_table in aligned_tables: gt_table_html = gt_table['html'] # marker wraps the table in <tbody> which fintabnet data doesn't # Fintabnet doesn't use th tags, need to be replaced for fair comparison marker_table_html = fix_table_html(marker_table.html) gemini_table_html = fix_table_html(gemini_table) results.append({ "marker_table": marker_table_html, "gt_table": gt_table_html, "gemini_table": gemini_table_html }) except pdfium.PdfiumError: print('Broken PDF, Skipping...') continue return results, total_unaligned ``` -------------------------------------------------------------------------------- /marker/scripts/extraction_app.py: -------------------------------------------------------------------------------- ```python import json import os from streamlit_ace import st_ace from pydantic import BaseModel from marker.converters.extraction import ExtractionConverter from marker.scripts.common import ( parse_args, load_models, get_page_image, page_count, get_root_class, ) os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" os.environ["IN_STREAMLIT"] = "true" from streamlit.runtime.uploaded_file_manager import UploadedFile import tempfile from typing import Any, Dict import streamlit as st from marker.config.parser import ConfigParser def extract_data( fname: str, config: dict, schema: str, markdown: str | None = None ) -> (str, Dict[str, Any], dict): config["pdftext_workers"] = 1 config["page_schema"] = schema config["existing_markdown"] = markdown config_parser = ConfigParser(config) config_dict = config_parser.generate_config_dict() converter_cls = ExtractionConverter converter = converter_cls( config=config_dict, artifact_dict=model_dict, processor_list=config_parser.get_processors(), renderer=config_parser.get_renderer(), llm_service=config_parser.get_llm_service(), ) return converter(fname) st.set_page_config(layout="wide") col1, col2 = st.columns([0.5, 0.5]) model_dict = load_models() cli_options = parse_args() st.markdown(""" # Marker Extraction Demo This app will let you use marker to do structured extraction. Warning: This can execute untrusted code entered into the schema panel. 
""") in_file: UploadedFile = st.sidebar.file_uploader( "PDF, document, or image file:", type=["pdf", "png", "jpg", "jpeg", "gif", "pptx", "docx", "xlsx", "html", "epub"], ) # Initialize session state variables if "rendered_pydantic_schema" not in st.session_state: st.session_state.rendered_pydantic_schema = "" if "markdown" not in st.session_state: st.session_state.markdown = "" if "current_file_id" not in st.session_state: st.session_state.current_file_id = None # Detect file changes and clear markdown when new file is uploaded if in_file is not None: # Create a unique identifier for the current file current_file_id = f"{in_file.name}_{in_file.size}_{hash(in_file.getvalue())}" # Check if this is a new file if st.session_state.current_file_id != current_file_id: st.session_state.current_file_id = current_file_id st.session_state.markdown = "" # Clear markdown for new file else: # No file uploaded, clear the current file ID if st.session_state.current_file_id is not None: st.session_state.current_file_id = None st.session_state.markdown = "" # Clear markdown when no file st.session_state.rendered_pydantic_schema = "" if in_file is None: st.stop() filetype = in_file.type with col1: page_count = page_count(in_file) page_number = st.number_input( f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count ) pil_image = get_page_image(in_file, page_number) st.image(pil_image, use_container_width=True) with col2: tab1, tab2 = st.tabs(["JSON Schema", "Pydantic Schema"]) # Initialize schema variable schema = None with tab1: st.write("Enter an existing JSON schema here:") default_json_value = ( st.session_state.rendered_pydantic_schema if st.session_state.rendered_pydantic_schema else "" ) json_schema_input = st.text_area( "JSON Schema", value=default_json_value, height=300, placeholder='{"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}}', key="json_schema_input", label_visibility="collapsed", ) # Set schema if JSON input is provided if json_schema_input and json_schema_input.strip(): try: # Validate JSON json.loads(json_schema_input) schema = json_schema_input.strip() st.success("✅ Valid JSON schema detected") except json.JSONDecodeError as e: st.error(f"❌ Invalid JSON: {e}") schema = None with tab2: st.write("Enter pydantic schema here:") pydantic_schema_input = st_ace( value="""from pydantic import BaseModel class Schema(BaseModel): # Add your fields here # Example: name: str age: int # email: str pass""", language="python", height=300, key="pydantic_editor", ) render_schema = st.button("🔄 Render Pydantic schema to JSON") if render_schema and pydantic_schema_input: try: pydantic_root: BaseModel = get_root_class(pydantic_schema_input) json_schema = pydantic_root.model_json_schema() schema = json.dumps(json_schema, indent=2) st.success("✅ Schema rendered successfully!") st.json(json_schema) st.session_state.rendered_pydantic_schema = schema except Exception as e: st.error(f"❌ Could not parse your schema: {e}") schema = None elif ( pydantic_schema_input and pydantic_schema_input.strip() and not render_schema ): # If there's Pydantic code but not rendered yet, show a message if ( "class Schema(BaseModel):" in pydantic_schema_input and "pass" not in pydantic_schema_input ): st.info( "💡 Click 'Render Pydantic schema to JSON' to convert your Pydantic model to JSON schema" ) # Move the run logic outside of col2 run_marker = st.sidebar.button("Run Extraction") use_llm = st.sidebar.checkbox( "Use LLM", help="Use LLM for higher quality text", 
value=False ) force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False) strip_existing_ocr = st.sidebar.checkbox( "Strip existing OCR", help="Strip existing OCR text from the PDF and re-OCR.", value=False, ) # Check if schema is provided before running if run_marker: if not schema: st.error( "❌ Please provide a schema in either the JSON Schema or Pydantic Schema tab before running extraction." ) st.stop() # Run Marker with tempfile.TemporaryDirectory() as tmp_dir: temp_pdf = os.path.join(tmp_dir, "temp.pdf") with open(temp_pdf, "wb") as f: f.write(in_file.getvalue()) cli_options.update( { "force_ocr": force_ocr, "use_llm": use_llm, "strip_existing_ocr": strip_existing_ocr, } ) try: rendered = extract_data( temp_pdf, cli_options, schema, st.session_state.markdown ) with col2: st.write("## Output JSON") st.json(rendered.model_dump(exclude=["original_markdown"])) st.session_state.markdown = rendered.original_markdown except Exception as e: st.error(f"❌ Extraction failed: {e}") else: # Show instruction when not running if not schema: st.info("📝 Please provide a schema and click 'Run Extraction' to begin.") ``` -------------------------------------------------------------------------------- /benchmarks/overall/overall.py: -------------------------------------------------------------------------------- ```python import json import os import traceback from collections import defaultdict from pathlib import Path from typing import List import click import datasets import torch from tqdm import tqdm from benchmarks.overall.display.dataset import build_dataset from benchmarks.overall.registry import SCORE_REGISTRY, METHOD_REGISTRY from benchmarks.overall.schema import FullResult from marker.logger import configure_logging from marker.models import create_model_dict from marker.settings import settings from benchmarks.overall.display.table import print_scores configure_logging() def get_method_scores(benchmark_dataset: datasets.Dataset, methods: List[str], score_types: List[str], artifacts: dict, max_rows=None) -> FullResult: bench_scores = {} averages_by_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) averages_by_block_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) average_times = defaultdict(list) markdown_by_method = defaultdict(dict) total_rows = len(benchmark_dataset) if max_rows: total_rows = min(max_rows, total_rows) for idx, sample in tqdm(enumerate(benchmark_dataset), desc="Running benchmark", total=total_rows): if max_rows is not None and idx >= max_rows: break doc_type = sample["classification"] gt_cls = METHOD_REGISTRY["gt"] gt_blocks = json.loads(sample["gt_blocks"]) gt_md = gt_cls(**artifacts)(sample)["markdown"] markdown_by_method[idx]["gt"] = gt_md out_data = defaultdict(dict) try: for method in methods: method_cls = METHOD_REGISTRY[method](**artifacts) method_info = method_cls(sample) method_md = method_info["markdown"] if method_md is None: method_md = "" # Avoid None values average_times[method].append(method_info["time"]) markdown_by_method[idx][method] = method_md for score_type in score_types: score_cls = SCORE_REGISTRY[score_type]() try: scores = score_cls(sample, gt_md, method_md) except Exception as e: # Some scorers can fail, like the LLM one print(f"Failed to score {method} with {score_type}: {e}") continue out_data[method][score_type] = scores averages_by_type[method][score_type][doc_type].append(scores["score"]) if "by_block" in scores["specific_scores"]: # Not all scorers support this for score, 
gt_block in zip(scores["specific_scores"]["by_block"], gt_blocks): averages_by_block_type[method][score_type][gt_block["block_type"]].append(score) except Exception as e: print(f"Failed to process {idx}: {e}") traceback.print_exc() if idx in markdown_by_method: del markdown_by_method[idx] continue bench_scores[idx] = out_data return { "scores": bench_scores, "markdown": markdown_by_method, "averages_by_type": averages_by_type, "averages_by_block_type": averages_by_block_type, "average_times": average_times, } @click.command(help="Benchmark PDF to MD conversion.") @click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark") @click.option("--out_dataset", type=str, help="Path to the output dataset", default=None) @click.option("--methods", type=str, help="Comma separated list of other methods to compare against. Possible values: marker,mathpix,llamaparse,docling,mistral", default="marker") @click.option("--scores", type=str, help="Comma separated list of scoring functions to use. Possible values: heuristic,llm", default="heuristic") @click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.") @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.") @click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.") @click.option("--languages", type=str, help="Comma separated list of languages to use for LLM", default=None) def main( dataset: str, out_dataset: str, methods: str, scores: str, result_path: str, max_rows: int, use_llm: bool, languages: str ): out_path = Path(result_path) out_path.mkdir(parents=True, exist_ok=True) methods = methods.split(",") for method in methods: if method not in METHOD_REGISTRY: raise ValueError(f"Method {method} not allowed. Allowed methods are {METHOD_REGISTRY.keys()}") # Ensure marker is always first all_methods = list(set(methods)) methods = ["marker"] if "marker" in all_methods else [] methods += [m for m in all_methods if m != "marker"] score_types = scores.split(",") for score_type in score_types: if score_type not in SCORE_REGISTRY: raise ValueError(f"Score type {score_type} not allowed. 
Allowed types are {SCORE_REGISTRY.keys()}") if languages: languages = languages.split(",") else: languages = None benchmark_dataset = datasets.load_dataset(dataset, split="train") if languages: benchmark_dataset = benchmark_dataset.filter(lambda x: x["language"] in languages) artifacts = { "model_dict": create_model_dict(), "use_llm": use_llm, "mathpix_ds": None, "llamaparse_ds": None, } if "mathpix" in methods: artifacts["mathpix_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_mathpix", split="train") if "llamaparse" in methods: artifacts["llamaparse_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_llamaparse", split="train") if "mistral" in methods: artifacts["mistral_ds"] = datasets.load_dataset("datalab-to/marker_benchmark_mistral", split="train") if "olmocr" in methods: from transformers import AutoProcessor, Qwen2VLForConditionalGeneration model = Qwen2VLForConditionalGeneration.from_pretrained("allenai/olmOCR-7B-0225-preview", torch_dtype=torch.bfloat16).eval() processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct") model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) artifacts["olmocr_model"] = {"model": model, "processor": processor} print(f"Running benchmark with methods: {methods} and scores: {score_types}") result = get_method_scores(benchmark_dataset, methods, score_types, artifacts, max_rows=max_rows) # Display benchmark scoring tables print_scores(result, out_path, methods, score_types, default_method=methods[0], default_score_type=score_types[0]) # Write to json with open(out_path / "result.json", "w") as f: json.dump(result, f) if out_dataset: if use_llm: out_dataset += "_llm" dataset = build_dataset(benchmark_dataset, result, score_types, max_rows=max_rows) dataset.push_to_hub(out_dataset, private=True) if __name__ == "__main__": main() ``` -------------------------------------------------------------------------------- /marker/util.py: -------------------------------------------------------------------------------- ```python import inspect import os from importlib import import_module from typing import List, Annotated import re import numpy as np import requests from pydantic import BaseModel from marker.schema.polygon import PolygonBox from marker.settings import settings OPENING_TAG_REGEX = re.compile(r"<((?:math|i|b))(?:\s+[^>]*)?>") CLOSING_TAG_REGEX = re.compile(r"</((?:math|i|b))>") TAG_MAPPING = { 'i': 'italic', 'b': 'bold', 'math': 'math', 'mark': 'highlight', 'sub': 'subscript', 'sup': 'superscript', 'small': 'small', 'u': 'underline', 'code': 'code' } def strings_to_classes(items: List[str]) -> List[type]: classes = [] for item in items: module_name, class_name = item.rsplit('.', 1) module = import_module(module_name) classes.append(getattr(module, class_name)) return classes def classes_to_strings(items: List[type]) -> List[str]: for item in items: if not inspect.isclass(item): raise ValueError(f"Item {item} is not a class") return [f"{item.__module__}.{item.__name__}" for item in items] def verify_config_keys(obj): annotations = inspect.get_annotations(obj.__class__) none_vals = "" for attr_name, annotation in annotations.items(): if isinstance(annotation, type(Annotated[str, ""])): value = getattr(obj, attr_name) if value is None: none_vals += f"{attr_name}, " assert len(none_vals) == 0, f"In order to use {obj.__class__.__name__}, you must set the configuration values `{none_vals}`." 
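# Usage sketch (illustrative only, not part of the original module):
# strings_to_classes and classes_to_strings are inverses. One imports classes
# from dotted module paths, the other serializes them back:
#
#     paths = ["marker.processors.debug.DebugProcessor"]
#     assert classes_to_strings(strings_to_classes(paths)) == paths
#
# This round-trip is what lets converters accept processor lists as plain
# strings (see PdfConverter.__init__ in marker/converters/pdf.py).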
def assign_config(cls, config: BaseModel | dict | None): cls_name = cls.__class__.__name__ if config is None: return elif isinstance(config, BaseModel): dict_config = config.dict() elif isinstance(config, dict): dict_config = config else: raise ValueError("config must be a dict or a pydantic BaseModel") for k in dict_config: if hasattr(cls, k): setattr(cls, k, dict_config[k]) for k in dict_config: if cls_name not in k: continue # Enables using class-specific keys, like "MarkdownRenderer_remove_blocks" split_k = k.removeprefix(cls_name + "_") if hasattr(cls, split_k): setattr(cls, split_k, dict_config[k]) def parse_range_str(range_str: str) -> List[int]: range_lst = range_str.split(",") page_lst = [] for i in range_lst: if "-" in i: start, end = i.split("-") page_lst += list(range(int(start), int(end) + 1)) else: page_lst.append(int(i)) page_lst = sorted(list(set(page_lst))) # Deduplicate page numbers and sort in order return page_lst def matrix_intersection_area(boxes1: List[List[float]], boxes2: List[List[float]]) -> np.ndarray: if len(boxes1) == 0 or len(boxes2) == 0: return np.zeros((len(boxes1), len(boxes2))) boxes1 = np.array(boxes1) boxes2 = np.array(boxes2) boxes1 = boxes1[:, np.newaxis, :] # Shape: (N, 1, 4) boxes2 = boxes2[np.newaxis, :, :] # Shape: (1, M, 4) min_x = np.maximum(boxes1[..., 0], boxes2[..., 0]) # Shape: (N, M) min_y = np.maximum(boxes1[..., 1], boxes2[..., 1]) max_x = np.minimum(boxes1[..., 2], boxes2[..., 2]) max_y = np.minimum(boxes1[..., 3], boxes2[..., 3]) width = np.maximum(0, max_x - min_x) height = np.maximum(0, max_y - min_y) return width * height # Shape: (N, M) def matrix_distance(boxes1: List[List[float]], boxes2: List[List[float]]) -> np.ndarray: if len(boxes2) == 0: return np.zeros((len(boxes1), 0)) if len(boxes1) == 0: return np.zeros((0, len(boxes2))) boxes1 = np.array(boxes1) # Shape: (N, 4) boxes2 = np.array(boxes2) # Shape: (M, 4) boxes1_centers = (boxes1[:, :2] + boxes1[:, 2:]) / 2 # Shape: (N, 2) boxes2_centers = (boxes2[:, :2] + boxes2[:, 2:]) / 2 # Shape: (M, 2) boxes1_centers = boxes1_centers[:, np.newaxis, :] # Shape: (N, 1, 2) boxes2_centers = boxes2_centers[np.newaxis, :, :] # Shape: (1, M, 2) distances = np.linalg.norm(boxes1_centers - boxes2_centers, axis=2) # Shape: (N, M) return distances def sort_text_lines(lines: List[PolygonBox], tolerance=1.25): # Sorts in reading order. Not 100% accurate, this should only # be used as a starting point for more advanced sorting. vertical_groups = {} for line in lines: group_key = round(line.bbox[1] / tolerance) * tolerance if group_key not in vertical_groups: vertical_groups[group_key] = [] vertical_groups[group_key].append(line) # Sort each group horizontally and flatten the groups into a single list sorted_lines = [] for _, group in sorted(vertical_groups.items()): sorted_group = sorted(group, key=lambda x: x.bbox[0]) sorted_lines.extend(sorted_group) return sorted_lines def download_font(): if not os.path.exists(settings.FONT_PATH): os.makedirs(os.path.dirname(settings.FONT_PATH), exist_ok=True) font_dl_path = f"{settings.ARTIFACT_URL}/{settings.FONT_NAME}" with requests.get(font_dl_path, stream=True) as r, open(settings.FONT_PATH, 'wb') as f: r.raise_for_status() for chunk in r.iter_content(chunk_size=8192): f.write(chunk) def get_opening_tag_type(tag): """ Determines if a tag is an opening tag and extracts the tag type. Args: tag (str): The tag string to analyze.
Returns: tuple: (is_opening_tag (bool), tag_type (str or None)) """ match = OPENING_TAG_REGEX.match(tag) if match: tag_type = match.group(1) if tag_type in TAG_MAPPING: return True, TAG_MAPPING[tag_type] return False, None def get_closing_tag_type(tag): """ Determines if a tag is a closing tag and extracts the tag type. Args: tag (str): The tag string to analyze. Returns: tuple: (is_closing_tag (bool), tag_type (str or None)) """ match = CLOSING_TAG_REGEX.match(tag) if match: tag_type = match.group(1) if tag_type in TAG_MAPPING: return True, TAG_MAPPING[tag_type] return False, None # Modification of unwrap_math from surya.recognition MATH_SYMBOLS = ["^", "_", "\\", "{", "}"] MATH_TAG_PATTERN = re.compile(r'<math\b[^>]*>.*?</math>', re.DOTALL) LATEX_ESCAPES = { r'\%': '%', r'\$': '$', r'\_': '_', r'\&': '&', r'\#': '#', r'\‰': '‰', } def normalize_latex_escapes(s: str) -> str: for k, v in LATEX_ESCAPES.items(): s = s.replace(k, v) return s def unwrap_math(text: str, math_symbols: List[str] = MATH_SYMBOLS) -> str: """Unwrap a single <math>...</math> block if it's not really math.""" if MATH_TAG_PATTERN.match(text): # Remove tags inner = re.sub(r'^\s*<math\b[^>]*>|</math>\s*$', '', text, flags=re.DOTALL) # Strip a single leading/trailing \\ plus surrounding whitespace inner_stripped = re.sub(r'^\s*\\\\\s*|\s*\\\\\s*$', '', inner) # Unwrap \text{...} unwrapped = re.sub(r'\\text[a-zA-Z]*\s*\{(.*?)\}', r'\1', inner_stripped) # Normalize escapes normalized = normalize_latex_escapes(unwrapped) # If no math symbols remain → unwrap fully if not any(symb in normalized for symb in math_symbols): return normalized.strip() # Otherwise, return as-is return text ``` -------------------------------------------------------------------------------- /marker/processors/debug.py: -------------------------------------------------------------------------------- ```python import json import os from typing import Annotated from PIL import Image, ImageDraw, ImageFont from marker.logger import get_logger from marker.processors import BaseProcessor from marker.schema import BlockTypes from marker.schema.document import Document from marker.settings import settings logger = get_logger() class DebugProcessor(BaseProcessor): """ A processor for debugging the document. """ block_types: Annotated[ tuple, "The block types to process.", "Default is an empty tuple."
] = tuple() debug_data_folder: Annotated[ str, "The folder to dump debug data to.", ] = "debug_data" debug_layout_images: Annotated[ bool, "Whether to dump layout debug images.", ] = False debug_pdf_images: Annotated[ bool, "Whether to dump PDF debug images.", ] = False debug_json: Annotated[ bool, "Whether to dump block debug data.", ] = False def __call__(self, document: Document): # Remove extension from doc name doc_base = os.path.basename(document.filepath).rsplit(".", 1)[0] self.debug_folder = os.path.join(self.debug_data_folder, doc_base) if any([self.debug_layout_images, self.debug_pdf_images, self.debug_json]): os.makedirs(self.debug_folder, exist_ok=True) document.debug_data_path = self.debug_folder if self.debug_layout_images: self.draw_layout_debug_images(document) logger.info(f"Dumped layout debug images to {self.debug_folder}") if self.debug_pdf_images: self.draw_pdf_debug_images(document) logger.info(f"Dumped PDF debug images to {self.debug_folder}") if self.debug_json: self.dump_block_debug_data(document) logger.info(f"Dumped block debug data to {self.debug_folder}") def draw_pdf_debug_images(self, document: Document): for page in document.pages: png_image = page.get_image(highres=True).copy() line_bboxes = [] span_bboxes = [] line_ids = [] for child in page.children: # Skip any blocks that have been removed if child.removed: continue if child.block_type == BlockTypes.Line: bbox = child.polygon.rescale(page.polygon.size, png_image.size).bbox line_bboxes.append(bbox) line_ids.append(child.block_id) elif child.block_type == BlockTypes.Span: bbox = child.polygon.rescale(page.polygon.size, png_image.size).bbox span_bboxes.append(bbox) self.render_on_image( line_bboxes, png_image, color="blue", draw_bbox=True, label_font_size=24, labels=[str(i) for i in line_ids], ) png_image = self.render_layout_boxes(page, png_image) debug_file = os.path.join(self.debug_folder, f"pdf_page_{page.page_id}.png") png_image.save(debug_file) def draw_layout_debug_images(self, document: Document, pdf_mode=False): for page in document.pages: img_size = page.get_image(highres=True).size png_image = Image.new("RGB", img_size, color="white") line_bboxes = [] line_text = [] for child in page.children: if child.removed: continue if child.block_type != BlockTypes.Line: continue bbox = child.polygon.rescale(page.polygon.size, img_size).bbox line_bboxes.append(bbox) line_text.append(child.raw_text(document)) self.render_on_image( line_bboxes, png_image, labels=line_text, color="black", draw_bbox=False, label_font_size=24, ) png_image = self.render_layout_boxes(page, png_image) debug_file = os.path.join( self.debug_folder, f"layout_page_{page.page_id}.png" ) png_image.save(debug_file) def render_layout_boxes(self, page, png_image): layout_bboxes = [] layout_labels = [] for block_id in page.structure: child = page.get_block(block_id) if child.block_type in [BlockTypes.Line, BlockTypes.Span]: continue bbox = child.polygon.rescale(page.polygon.size, png_image.size).bbox layout_bboxes.append(bbox) layout_labels.append(str(child.block_type)) self.render_on_image( layout_bboxes, png_image, labels=layout_labels, color="red", label_font_size=24, ) order_labels = [str(i) for i in range(len(layout_bboxes))] self.render_on_image( layout_bboxes, png_image, labels=order_labels, color="green", draw_bbox=False, label_offset=5, label_font_size=24, ) return png_image def dump_block_debug_data(self, document: Document): debug_file = os.path.join(self.debug_folder, "blocks.json") debug_data = [] for page in
document.pages: page_data = page.model_dump( exclude={ "lowres_image": True, "highres_image": True, "children": { "__all__": {"lowres_image": True, "highres_image": True} }, } ) debug_data.append(page_data) with open(debug_file, "w+") as f: json.dump(debug_data, f) def get_text_size(self, text, font): im = Image.new(mode="P", size=(0, 0)) draw = ImageDraw.Draw(im) _, _, width, height = draw.textbbox((0, 0), text=text, font=font) return width, height def render_on_image( self, bboxes, image, labels=None, label_offset=1, label_font_size=10, color: str | list = "red", draw_bbox=True, ): draw = ImageDraw.Draw(image) font_path = settings.FONT_PATH label_font = ImageFont.truetype(font_path, label_font_size) for i, bbox in enumerate(bboxes): bbox = [int(p) for p in bbox] if draw_bbox: draw.rectangle( bbox, outline=color[i] if isinstance(color, list) else color, width=1, ) if labels is not None: label = labels[i] text_position = (bbox[0] + label_offset, bbox[1] + label_offset) text_size = self.get_text_size(label, label_font) if text_size[0] <= 0 or text_size[1] <= 0: continue box_position = ( text_position[0], text_position[1], text_position[0] + text_size[0], text_position[1] + text_size[1], ) draw.rectangle(box_position, fill="white") draw.text( text_position, label, fill=color[i] if isinstance(color, list) else color, font=label_font, ) return image ``` -------------------------------------------------------------------------------- /marker/converters/pdf.py: -------------------------------------------------------------------------------- ```python import os from marker.schema.document import Document os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning from collections import defaultdict from typing import Annotated, Any, Dict, List, Optional, Type, Tuple, Union import io from contextlib import contextmanager import tempfile from marker.processors import BaseProcessor from marker.services import BaseService from marker.processors.llm.llm_table_merge import LLMTableMergeProcessor from marker.providers.registry import provider_from_filepath from marker.builders.document import DocumentBuilder from marker.builders.layout import LayoutBuilder from marker.builders.line import LineBuilder from marker.builders.ocr import OcrBuilder from marker.builders.structure import StructureBuilder from marker.converters import BaseConverter from marker.processors.blockquote import BlockquoteProcessor from marker.processors.code import CodeProcessor from marker.processors.debug import DebugProcessor from marker.processors.document_toc import DocumentTOCProcessor from marker.processors.equation import EquationProcessor from marker.processors.footnote import FootnoteProcessor from marker.processors.ignoretext import IgnoreTextProcessor from marker.processors.line_numbers import LineNumbersProcessor from marker.processors.list import ListProcessor from marker.processors.llm.llm_complex import LLMComplexRegionProcessor from marker.processors.llm.llm_form import LLMFormProcessor from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor from marker.processors.llm.llm_table import LLMTableProcessor from marker.processors.page_header import PageHeaderProcessor from marker.processors.reference import ReferenceProcessor from marker.processors.sectionheader import SectionHeaderProcessor from marker.processors.table import TableProcessor from marker.processors.text import TextProcessor from marker.processors.block_relabel import BlockRelabelProcessor from 
marker.processors.blank_page import BlankPageProcessor from marker.processors.llm.llm_equation import LLMEquationProcessor from marker.renderers.markdown import MarkdownRenderer from marker.schema import BlockTypes from marker.schema.blocks import Block from marker.schema.registry import register_block_class from marker.util import strings_to_classes from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor from marker.processors.order import OrderProcessor from marker.services.gemini import GoogleGeminiService from marker.processors.line_merge import LineMergeProcessor from marker.processors.llm.llm_mathblock import LLMMathBlockProcessor from marker.processors.llm.llm_page_correction import LLMPageCorrectionProcessor from marker.processors.llm.llm_sectionheader import LLMSectionHeaderProcessor class PdfConverter(BaseConverter): """ A converter for processing and rendering PDF files into Markdown, JSON, HTML and other formats. """ override_map: Annotated[ Dict[BlockTypes, Type[Block]], "A mapping to override the default block classes for specific block types.", "The keys are `BlockTypes` enum values, representing the types of blocks,", "and the values are corresponding `Block` class implementations to use", "instead of the defaults.", ] = defaultdict() use_llm: Annotated[ bool, "Enable higher quality processing with LLMs.", ] = False default_processors: Tuple[BaseProcessor, ...] = ( OrderProcessor, BlockRelabelProcessor, LineMergeProcessor, BlockquoteProcessor, CodeProcessor, DocumentTOCProcessor, EquationProcessor, FootnoteProcessor, IgnoreTextProcessor, LineNumbersProcessor, ListProcessor, PageHeaderProcessor, SectionHeaderProcessor, TableProcessor, LLMTableProcessor, LLMTableMergeProcessor, LLMFormProcessor, TextProcessor, LLMComplexRegionProcessor, LLMImageDescriptionProcessor, LLMEquationProcessor, LLMHandwritingProcessor, LLMMathBlockProcessor, LLMSectionHeaderProcessor, LLMPageCorrectionProcessor, ReferenceProcessor, BlankPageProcessor, DebugProcessor, ) default_llm_service: BaseService = GoogleGeminiService def __init__( self, artifact_dict: Dict[str, Any], processor_list: Optional[List[str]] = None, renderer: str | None = None, llm_service: str | None = None, config=None, ): super().__init__(config) if config is None: config = {} for block_type, override_block_type in self.override_map.items(): register_block_class(block_type, override_block_type) if processor_list is not None: processor_list = strings_to_classes(processor_list) else: processor_list = self.default_processors if renderer: renderer = strings_to_classes([renderer])[0] else: renderer = MarkdownRenderer # Put here so that resolve_dependencies can access it self.artifact_dict = artifact_dict if llm_service: llm_service_cls = strings_to_classes([llm_service])[0] llm_service = self.resolve_dependencies(llm_service_cls) elif config.get("use_llm", False): llm_service = self.resolve_dependencies(self.default_llm_service) # Inject llm service into artifact_dict so it can be picked up by processors, etc. 
self.artifact_dict["llm_service"] = llm_service self.llm_service = llm_service self.renderer = renderer processor_list = self.initialize_processors(processor_list) self.processor_list = processor_list self.layout_builder_class = LayoutBuilder self.page_count = None # Track how many pages were converted @contextmanager def filepath_to_str(self, file_input: Union[str, io.BytesIO]): temp_file = None try: if isinstance(file_input, str): yield file_input else: with tempfile.NamedTemporaryFile( delete=False, suffix=".pdf" ) as temp_file: if isinstance(file_input, io.BytesIO): file_input.seek(0) temp_file.write(file_input.getvalue()) else: raise TypeError( f"Expected str or BytesIO, got {type(file_input)}" ) yield temp_file.name finally: if temp_file is not None and os.path.exists(temp_file.name): os.unlink(temp_file.name) def build_document(self, filepath: str) -> Document: provider_cls = provider_from_filepath(filepath) layout_builder = self.resolve_dependencies(self.layout_builder_class) line_builder = self.resolve_dependencies(LineBuilder) ocr_builder = self.resolve_dependencies(OcrBuilder) provider = provider_cls(filepath, self.config) document = DocumentBuilder(self.config)( provider, layout_builder, line_builder, ocr_builder ) structure_builder_cls = self.resolve_dependencies(StructureBuilder) structure_builder_cls(document) for processor in self.processor_list: processor(document) return document def __call__(self, filepath: str | io.BytesIO): with self.filepath_to_str(filepath) as temp_path: document = self.build_document(temp_path) self.page_count = len(document.pages) renderer = self.resolve_dependencies(self.renderer) rendered = renderer(document) return rendered ``` -------------------------------------------------------------------------------- /marker/schema/polygon.py: -------------------------------------------------------------------------------- ```python from __future__ import annotations import copy from typing import List import numpy as np from pydantic import BaseModel, field_validator, computed_field class PolygonBox(BaseModel): polygon: List[List[float]] @field_validator('polygon') @classmethod def check_elements(cls, v: List[List[float]]) -> List[List[float]]: if len(v) != 4: raise ValueError('corner must have 4 elements') for corner in v: if len(corner) != 2: raise ValueError('corner must have 2 elements') min_x = min([corner[0] for corner in v]) min_y = min([corner[1] for corner in v]) # Ensure corners are clockwise from top left corner_error = f" .Corners are {v}" assert v[2][1] >= min_y, f'bottom right corner should have a greater y value than top right corner' + corner_error assert v[3][1] >= min_y, 'bottom left corner should have a greater y value than top left corner' + corner_error assert v[1][0] >= min_x, 'top right corner should have a greater x value than top left corner' + corner_error assert v[2][0] >= min_x, 'bottom right corner should have a greater x value than bottom left corner' + corner_error return v @property def height(self): return self.bbox[3] - self.bbox[1] @property def width(self): return self.bbox[2] - self.bbox[0] @property def area(self): return self.width * self.height @property def center(self): return [(self.bbox[0] + self.bbox[2]) / 2, (self.bbox[1] + self.bbox[3]) / 2] @property def size(self): return [self.width, self.height] @property def x_start(self): return self.bbox[0] @property def y_start(self): return self.bbox[1] @property def x_end(self): return self.bbox[2] @property def y_end(self): return self.bbox[3] @computed_field 
@property def bbox(self) -> List[float]: min_x = min([corner[0] for corner in self.polygon]) min_y = min([corner[1] for corner in self.polygon]) max_x = max([corner[0] for corner in self.polygon]) max_y = max([corner[1] for corner in self.polygon]) return [min_x, min_y, max_x, max_y] def expand(self, x_margin: float, y_margin: float) -> PolygonBox: new_polygon = [] x_margin = x_margin * self.width y_margin = y_margin * self.height for idx, poly in enumerate(self.polygon): if idx == 0: new_polygon.append([poly[0] - x_margin, poly[1] - y_margin]) elif idx == 1: new_polygon.append([poly[0] + x_margin, poly[1] - y_margin]) elif idx == 2: new_polygon.append([poly[0] + x_margin, poly[1] + y_margin]) elif idx == 3: new_polygon.append([poly[0] - x_margin, poly[1] + y_margin]) return PolygonBox(polygon=new_polygon) def expand_y2(self, y_margin: float) -> PolygonBox: new_polygon = [] y_margin = y_margin * self.height for idx, poly in enumerate(self.polygon): if idx == 2: new_polygon.append([poly[0], poly[1] + y_margin]) elif idx == 3: new_polygon.append([poly[0], poly[1] + y_margin]) else: new_polygon.append(poly) return PolygonBox(polygon=new_polygon) def expand_y1(self, y_margin: float) -> PolygonBox: new_polygon = [] y_margin = y_margin * self.height for idx, poly in enumerate(self.polygon): if idx == 0: new_polygon.append([poly[0], poly[1] - y_margin]) elif idx == 1: new_polygon.append([poly[0], poly[1] - y_margin]) else: new_polygon.append(poly) return PolygonBox(polygon=new_polygon) def minimum_gap(self, other: PolygonBox): if self.intersection_pct(other) > 0: return 0 def dist(p1, p2): return ((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2) ** 0.5 left = other.bbox[2] < self.bbox[0] right = self.bbox[2] < other.bbox[0] bottom = other.bbox[3] < self.bbox[1] top = self.bbox[3] < other.bbox[1] if top and left: return dist((self.bbox[0], self.bbox[3]), (other.bbox[2], other.bbox[1])) elif left and bottom: return dist((self.bbox[0], self.bbox[1]), (other.bbox[2], other.bbox[3])) elif bottom and right: return dist((self.bbox[2], self.bbox[1]), (other.bbox[0], other.bbox[3])) elif right and top: return dist((self.bbox[2], self.bbox[3]), (other.bbox[0], other.bbox[1])) elif left: return self.bbox[0] - other.bbox[2] elif right: return other.bbox[0] - self.bbox[2] elif bottom: return self.bbox[1] - other.bbox[3] elif top: return other.bbox[1] - self.bbox[3] else: return 0 def center_distance(self, other: PolygonBox, x_weight: float = 1, y_weight: float = 1, absolute=False): if not absolute: return ((self.center[0] - other.center[0]) ** 2 * x_weight + (self.center[1] - other.center[1]) ** 2 * y_weight) ** 0.5 else: return abs(self.center[0] - other.center[0]) * x_weight + abs(self.center[1] - other.center[1]) * y_weight def tl_distance(self, other: PolygonBox): return ((self.bbox[0] - other.bbox[0]) ** 2 + (self.bbox[1] - other.bbox[1]) ** 2) ** 0.5 def rescale(self, old_size, new_size): # Point is in x, y format page_width, page_height = old_size img_width, img_height = new_size width_scaler = img_width / page_width height_scaler = img_height / page_height new_corners = copy.deepcopy(self.polygon) for corner in new_corners: corner[0] = corner[0] * width_scaler corner[1] = corner[1] * height_scaler return PolygonBox(polygon=new_corners) def fit_to_bounds(self, bounds): new_corners = copy.deepcopy(self.polygon) for corner in new_corners: corner[0] = max(min(corner[0], bounds[2]), bounds[0]) corner[1] = max(min(corner[1], bounds[3]), bounds[1]) return PolygonBox(polygon=new_corners) def overlap_x(self, 
other: PolygonBox): return max(0, min(self.bbox[2], other.bbox[2]) - max(self.bbox[0], other.bbox[0])) def overlap_y(self, other: PolygonBox): return max(0, min(self.bbox[3], other.bbox[3]) - max(self.bbox[1], other.bbox[1])) def intersection_area(self, other: PolygonBox): return self.overlap_x(other) * self.overlap_y(other) def intersection_pct(self, other: PolygonBox): if self.area == 0: return 0 intersection = self.intersection_area(other) return intersection / self.area def merge(self, others: List[PolygonBox]) -> PolygonBox: corners = [] for i in range(len(self.polygon)): x_coords = [self.polygon[i][0]] + [other.polygon[i][0] for other in others] y_coords = [self.polygon[i][1]] + [other.polygon[i][1] for other in others] min_x = min(x_coords) min_y = min(y_coords) max_x = max(x_coords) max_y = max(y_coords) if i == 0: corners.append([min_x, min_y]) elif i == 1: corners.append([max_x, min_y]) elif i == 2: corners.append([max_x, max_y]) elif i == 3: corners.append([min_x, max_y]) return PolygonBox(polygon=corners) @classmethod def from_bbox(cls, bbox: List[float], ensure_nonzero_area=False): if ensure_nonzero_area: bbox = list(bbox) bbox[2] = max(bbox[2], bbox[0] + 1) bbox[3] = max(bbox[3], bbox[1] + 1) return cls(polygon=[[bbox[0], bbox[1]], [bbox[2], bbox[1]], [bbox[2], bbox[3]], [bbox[0], bbox[3]]]) ``` -------------------------------------------------------------------------------- /marker/providers/powerpoint.py: -------------------------------------------------------------------------------- ```python import base64 import os import tempfile import traceback from marker.logger import get_logger from marker.providers.pdf import PdfProvider logger = get_logger() css = """ @page { size: A4 landscape; margin: 1.5cm; } table { width: 100%; border-collapse: collapse; break-inside: auto; font-size: 10pt; } tr { break-inside: avoid; page-break-inside: avoid; } td { border: 0.75pt solid #000; padding: 6pt; } img { max-width: 100%; height: auto; object-fit: contain; } """ class PowerPointProvider(PdfProvider): include_slide_number: bool = False def __init__(self, filepath: str, config=None): temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") self.temp_pdf_path = temp_pdf.name temp_pdf.close() # Convert PPTX to PDF try: self.convert_pptx_to_pdf(filepath) except Exception as e: print(traceback.format_exc()) raise ValueError(f"Error converting PPTX to PDF: {e}") # Initialize the PDF provider with the temp pdf path super().__init__(self.temp_pdf_path, config) def __del__(self): if os.path.exists(self.temp_pdf_path): os.remove(self.temp_pdf_path) def convert_pptx_to_pdf(self, filepath): from weasyprint import CSS, HTML from pptx import Presentation from pptx.enum.shapes import MSO_SHAPE_TYPE pptx = Presentation(filepath) html_parts = [] for slide_index, slide in enumerate(pptx.slides): html_parts.append("<section>") if self.include_slide_number: html_parts.append(f"<h2>Slide {slide_index + 1}</h2>") # Process shapes in the slide for shape in slide.shapes: # If shape is a group shape, we recursively handle all grouped shapes if shape.shape_type == MSO_SHAPE_TYPE.GROUP: html_parts.append(self._handle_group(shape)) continue # If shape is a table if shape.has_table: html_parts.append(self._handle_table(shape)) continue # If shape is a picture if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: html_parts.append(self._handle_image(shape)) continue # If shape has text if hasattr(shape, "text") and shape.text is not None: if shape.has_text_frame: # Distinguish placeholders (title, subtitle,
etc.) html_parts.append(self._handle_text(shape)) else: html_parts.append(f"<p>{self._escape_html(shape.text)}</p>") html_parts.append("</section>") html = "\n".join(html_parts) # We convert the HTML into a PDF HTML(string=html).write_pdf( self.temp_pdf_path, stylesheets=[CSS(string=css), self.get_font_css()] ) def _handle_group(self, group_shape) -> str: """ Recursively handle shapes in a group. Returns HTML string for the entire group. """ from pptx.enum.shapes import MSO_SHAPE_TYPE group_parts = [] for shape in group_shape.shapes: if shape.shape_type == MSO_SHAPE_TYPE.GROUP: group_parts.append(self._handle_group(shape)) continue if shape.has_table: group_parts.append(self._handle_table(shape)) continue if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: group_parts.append(self._handle_image(shape)) continue if hasattr(shape, "text"): if shape.has_text_frame: group_parts.append(self._handle_text(shape)) else: group_parts.append(f"<p>{self._escape_html(shape.text)}</p>") return "".join(group_parts) def _handle_text(self, shape) -> str: """ Processes shape text, including bullet/numbered list detection and placeholders (title, subtitle, etc.). Returns HTML for the text block(s). """ from pptx.enum.shapes import PP_PLACEHOLDER # Distinguish placeholders to see if it's a title or subtitle label_html_tag = "p" if shape.is_placeholder: placeholder_type = shape.placeholder_format.type if placeholder_type in [PP_PLACEHOLDER.TITLE, PP_PLACEHOLDER.CENTER_TITLE]: label_html_tag = "h3" elif placeholder_type == PP_PLACEHOLDER.SUBTITLE: label_html_tag = "h4" # Keep track of whether we are currently in a <ul> or <ol> html_parts = [] list_open = False list_type = None # "ul" or "ol" for paragraph in shape.text_frame.paragraphs: p_el = paragraph._element # Check bullet bullet_char = p_el.find(".//a:buChar", namespaces=p_el.nsmap) bullet_num = p_el.find(".//a:buAutoNum", namespaces=p_el.nsmap) is_bullet = (bullet_char is not None) or (paragraph.level > 0) is_numbered = bullet_num is not None # If the paragraph is bullet or numbered if is_bullet or is_numbered: # Decide if we need to start a new list or continue an existing one current_list_type = "ol" if is_numbered else "ul" if not list_open: # Start new list_open = True list_type = current_list_type html_parts.append(f"<{list_type}>") elif list_open and list_type != current_list_type: # Close old list, start new html_parts.append(f"</{list_type}>") list_type = current_list_type html_parts.append(f"<{list_type}>") # Build the bullet (li) text from all runs in the paragraph p_text = "".join(run.text for run in paragraph.runs) if p_text: html_parts.append(f"<li>{self._escape_html(p_text)}</li>") else: # If we were in a list, we need to close it if list_open: html_parts.append(f"</{list_type}>") list_open = False list_type = None # Now it's just a normal paragraph # Gather the paragraph text from runs p_text = "".join(run.text for run in paragraph.runs) if p_text: # If we know it's a slide title, we can use <h3> or so html_parts.append( f"<{label_html_tag}>{self._escape_html(p_text)}</{label_html_tag}>" ) # If the text frame ended and we still have an open list, close it if list_open: html_parts.append(f"</{list_type}>") return "".join(html_parts) def _handle_image(self, shape) -> str: """ Embeds the image as a base64 <img> in HTML. 
""" image = shape.image image_bytes = image.blob try: img_str = base64.b64encode(image_bytes).decode("utf-8") return f"<img src='data:{image.content_type};base64,{img_str}' />" except Exception as e: logger.warning(f"Warning: image cannot be loaded by Pillow: {e}") return "" def _handle_table(self, shape) -> str: """ Renders a shape's table as an HTML <table>. """ table_html = [] table_html.append("<table border='1'>") for row in shape.table.rows: row_html = ["<tr>"] for cell in row.cells: row_html.append(f"<td>{self._escape_html(cell.text)}</td>") row_html.append("</tr>") table_html.append("".join(row_html)) table_html.append("</table>") return "".join(table_html) def _escape_html(self, text: str) -> str: """ Minimal escaping for HTML special characters. """ return ( text.replace("&", "&") .replace("<", "<") .replace(">", ">") .replace('"', """) .replace("'", "'") ) ``` -------------------------------------------------------------------------------- /benchmarks/overall/elo.py: -------------------------------------------------------------------------------- ```python import json import random import time import os from dataclasses import dataclass from typing import List, Dict, Tuple, Literal from PIL import Image from collections import defaultdict import tabulate import click import datasets from google import genai from google.genai.errors import APIError from pydantic import BaseModel from tqdm import tqdm from marker.settings import settings rating_prompt = """ You're a document analysis expert who is comparing two different markdown samples to an image to see which one represents the content of the image better. The markdown will be called version A and version B. Here are some notes on the image and markdown: - Some parts of the page may have been recognized as images and linked from the markdown, like ``. - Tables will be formatted as Github flavored markdown. - Block equations will be in LaTeX. - The image and markdown may be in any language. - The markdown is based on the text extracted from the document, and sometimes the document may have had bad OCR applied to it, resulting in gibberish text. The markdown should fully capture the meaning and formatting of the text in the image. You'll evaluate the markdown based on the image provided. **Instructions** Follow this process to evaluate the markdown: 1. Carefully examine the image. 2. Carefully examine the first markdown input provided. 3. Describe how well version a represents the image. 4. Carefully examine the second markdown input provided. 5. Describe how well version B represents the image. 6. Compare version A and version B. 7. Decide which markdown representation is better, based on the criteria below. Output version_a if version a is better, and version_b if version b is better. Use these criteria when judging the markdown: - Overall - the overall quality of the markdown as compared to the image. - Text quality - the quality of the text extraction from the image. - Formatting quality - the quality of the formatting applied to the markdown, as compared to the image. - Tables - how effectively the tables have been extracted and formatted. - Forms - how effectively the forms have extracted and formatted. - Equations - how effectively block equations have been converted to LaTeX. - Lists - if the lists have been properly extracted and formatted. - Images - if images are identified and placed correctly. 
Notes on scoring: - Perfect markdown will include all of the important text from the image, and the formatting will be correct (minor mistakes okay). It's okay to omit some text that isn't important to the meaning, like page numbers and chapter headings. If the entire page is an image, it's okay if the markdown is just a link to the image, unless the image would be better represented as text. - Bad markdown will have major missing text segments from the markdown or completely unreadable formatting. It may also have key values that are different from the values in the image. Output json, like in the example below. **Example** Version A ```markdown # *Section 1* This is some *markdown* extracted from a document. Here is a block equation: $$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 123}{t}$$ ``` Version B ```markdown # Section 1 This is some markdown extracted from a document. Here is a block equation: $$\frac{ab \cdot x^5 + x^2 + 2 \cdot x + 124}{t}$$ ``` Output ```json { "image_description": "In the image, there is a section header 'Section 1', followed by some text and a block equation.", "version_a_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation.", "version_b_description": "In the markdown, there is a section header 'Section 1', followed by some text and a block equation. The formatting in version B is slightly different from the image. The value 124 is also different from the image.", "comparison": "Version A is better than version B. The text and formatting in version A matches the image better than version B. Version B also has an incorrect value.", "winner": "version_a" } ``` **Input** Version A ```markdown {{version_a}} ``` Version B ```markdown {{version_b}} ``` **Output** """ class ComparerSchema(BaseModel): image_description: str version_a_description: str version_b_description: str comparison: str winner: Literal["version_a", "version_b"] class Comparer: def __init__(self): pass def __call__( self, img: Image.Image, version_a: str, version_b: str ) -> str | None: if version_a is None and version_b is not None: return "version_b" elif version_b is None and version_a is not None: return "version_a" hydrated_prompt = rating_prompt.replace("{{version_a}}", version_a).replace("{{version_b}}", version_b) try: rating = self.llm_rater(img, hydrated_prompt) except Exception as e: print(f"Error: {e}") return return rating def llm_rater(self, img: Image.Image, prompt: str): response = self.llm_response_wrapper( [img, prompt], ComparerSchema ) assert "winner" in response, f"Response missing 'winner' key: {response}" return response["winner"] def llm_response_wrapper( self, prompt, response_schema, ): client = genai.Client( http_options={"timeout": 60000}, vertexai=True, project=os.getenv("VERTEX_PROJECT_ID"), location=os.getenv("VERTEX_LOCATION"), ) try: responses = client.models.generate_content( model="gemini-2.0-flash-001", contents=prompt, config={ "temperature": 0, "response_schema": response_schema, "response_mime_type": "application/json", }, ) output = responses.candidates[0].content.parts[0].text return json.loads(output) except APIError as e: print(f"Hit Gemini rate limit: {e}") return except Exception as e: print(f"Error: {e}") return def display_win_rates_table(win_rates: dict): table = [] headers = ["Method A", "Method B", "Wins", "Losses", "Win %"] for method_a, method_b_dict in win_rates.items(): for method_b, results in method_b_dict.items(): row = [method_a, method_b, results["win"], results["loss"],
(results["win"] / (results["win"] + results["loss"])) * 100] table.append(row) print(tabulate.tabulate(table, headers=headers, tablefmt="pretty")) @click.command("Calculate win rates for document conversion methods") @click.argument("dataset", type=str) @click.option("--methods", type=str, help="List of methods to compare: comma separated like marker,mathpix") @click.option("--row_samples", type=int, default=2, help="Number of samples per row") @click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process") def main( dataset: str, methods: str, row_samples: int, max_rows: int ): ds = datasets.load_dataset(dataset, split="train") method_lst = methods.split(",") win_rates = {m: defaultdict(lambda: defaultdict(int)) for m in method_lst} comparer = Comparer() max_rows = max_rows or len(ds) for i in tqdm(range(max_rows), desc="Calculating win rates..."): row = ds[i] # Avoid any bias in ordering random.shuffle(method_lst) for j, method_a in enumerate(method_lst[:-1]): for z, method_b in enumerate(method_lst[j:]): if method_a == method_b: continue method_a_md = row[f"{method_a}_md"] method_b_md = row[f"{method_b}_md"] winner = comparer(row["img"], method_a_md, method_b_md) if not winner: continue if winner == "version_a": win_rates[method_a][method_b]["win"] += 1 win_rates[method_b][method_a]["loss"] += 1 else: win_rates[method_b][method_a]["win"] += 1 win_rates[method_a][method_b]["loss"] += 1 if i % 10 == 0: display_win_rates_table(win_rates) display_win_rates_table(win_rates) if __name__ == "__main__": main() ``` -------------------------------------------------------------------------------- /marker/processors/llm/llm_mathblock.py: -------------------------------------------------------------------------------- ```python from concurrent.futures import ThreadPoolExecutor, as_completed from typing import List, Tuple, Annotated from pydantic import BaseModel from tqdm import tqdm from marker.output import json_to_html, unwrap_outer_tag from marker.processors.llm import BaseLLMComplexBlockProcessor from marker.schema import BlockTypes from marker.schema.blocks import Block, InlineMath from marker.schema.document import Document from marker.schema.groups import PageGroup class LLMMathBlockProcessor(BaseLLMComplexBlockProcessor): redo_inline_math: Annotated[ bool, "If True, the inline math will be re-done, otherwise it will be left as is.", ] = False inlinemath_min_ratio: Annotated[ float, "If more than this ratio of blocks are inlinemath blocks, assume everything has math.", ] = 0.4 block_types = (BlockTypes.TextInlineMath,) # Primary block type additional_block_types = ( BlockTypes.Text, BlockTypes.Caption, BlockTypes.SectionHeader, BlockTypes.Footnote, ) # Seconday, can also contain math text_math_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images. You will receive an image of a text block and extracted text corresponding to the text in the image. Your task is to correct any errors in the extracted text, including math, formatting, and other inaccuracies, and output the corrected block in html format. Stay as faithful to the text in the image as possible. **Instructions:** 1. Carefully examine the provided text block image . 2. Analyze the text that has been extracted from the block. 3. Compare the extracted text to the corresponding text in the image. 4. Write a short analysis of the text block, including any errors you see in the extracted text. 5. 
If there are no errors in any of the extracted text, output "No corrections needed". 6. Correct any errors in the extracted text, including: * Inline math: Ensure all mathematical expressions are correctly formatted and rendered. Surround them with <math>...</math> tags. The math expressions should be rendered in simple, concise, KaTeX-compatible LaTeX. Do not use $ or $$ as delimiters. * If a math expression is not in LaTeX format, convert it to LaTeX format, and surround it with <math>...</math> tags. * Formatting: Maintain consistent formatting with the text block image, including spacing, indentation, subscripts/superscripts, and special characters. Use the <i>, <b>, <sup>, <sub>, and <span> tags to format the text as needed. * Other inaccuracies: If the image is handwritten then you may correct any spelling errors, or other discrepancies. * Ensure lines wrap properly, and that newlines are not in the middle of sentences. 7. Do not remove any formatting, i.e. bold, italics, math, superscripts, subscripts, etc., from the extracted text unless it is necessary to correct an error. 8. Output the corrected text in html format, as shown in the example below. Only use the p, math, br, a, i, b, sup, sub, and span tags. 9. You absolutely cannot remove any <a href='#...'>...</a> tags, those are extremely important for references and are coming directly from the document, you MUST always preserve them. **Example:** Input: ```html Adversarial training (AT) <a href='#page-9-1'>[23]</a>, which aims to minimize the model's risk under the worst-case perturbations, is currently the most effective approach for improving the robustness of deep neural networks. For a given neural network f(x, w) with parameters w, the optimization objective of AT can be formulated as follows: ``` Output: analysis: The inline math is not in LaTeX format and is not surrounded by <math>...</math> tags. ```html Adversarial training <i>(AT)</i> <a href='#page-9-1'>[23]</a>, which aims to minimize the model's risk under the worst-case perturbations, is currently the most effective approach for improving the robustness of deep neural networks.
For a given neural network <math>f(x, w)</math> with parameters <math>w</math>, the optimization objective of AT can be formulated as follows: ``` **Input:** ```html {extracted_html} ``` """ def rewrite_blocks(self, document: Document): if not self.redo_inline_math: return # Get inline math blocks inline_blocks: List[InlineMath] = [ (page, block) for page in document.pages for block in page.contained_blocks(document, self.block_types) ] # Get other blocks with detected math in them detected_blocks = [ (page, block) for page in document.pages for block in page.contained_blocks( document, ( BlockTypes.Text, BlockTypes.Caption, BlockTypes.SectionHeader, BlockTypes.Footnote, BlockTypes.ListItem, ), ) if any( [ b.formats and "math" in b.formats for b in block.contained_blocks(document, (BlockTypes.Line,)) ] ) ] # If a page has enough math blocks, assume all blocks can contain math additional_text_blocks = [] for page in document.pages: # Check for inline math blocks page_inlinemath_blocks = [ im for im in inline_blocks if im[0].page_id == page.page_id ] page_detected_blocks = [ db for db in detected_blocks if db[0].page_id == page.page_id ] math_block_count = len(page_inlinemath_blocks) + len(page_detected_blocks) # Find all potential blocks additional_blocks = page.contained_blocks( document, self.additional_block_types + self.block_types ) # Check if the ratio of math blocks to additional blocks is high enough if ( math_block_count / max(1, len(additional_blocks)) < self.inlinemath_min_ratio ): continue for b in additional_blocks: if (page, b) not in detected_blocks and (page, b) not in inline_blocks: additional_text_blocks.append((page, b)) inference_blocks = inline_blocks + detected_blocks + additional_text_blocks # Don't show progress if there are no blocks to process total_blocks = len(inference_blocks) if total_blocks == 0: return pbar = tqdm( total=total_blocks, desc=f"{self.__class__.__name__} running", disable=self.disable_tqdm ) with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor: for future in as_completed( [ executor.submit(self.process_rewriting, document, b[0], b[1]) for b in inference_blocks ] ): future.result() # Raise exceptions if any occurred pbar.update(1) pbar.close() def get_block_text(self, block: Block, document: Document) -> str: html = json_to_html(block.render(document)) html = unwrap_outer_tag(html) # Remove an outer p tag if it exists return html def get_block_lines(self, block: Block, document: Document) -> Tuple[list, list]: text_lines = block.contained_blocks(document, (BlockTypes.Line,)) extracted_lines = [line.formatted_text(document) for line in text_lines] return text_lines, extracted_lines def process_rewriting(self, document: Document, page: PageGroup, block: Block): block_text = self.get_block_text(block, document) prompt = self.text_math_rewriting_prompt.replace("{extracted_html}", block_text) image = self.extract_image(document, block) response = self.llm_service(prompt, image, block, LLMTextSchema) if not response or "corrected_html" not in response: block.update_metadata(llm_error_count=1) return corrected_html = response["corrected_html"] if not corrected_html: block.update_metadata(llm_error_count=1) return # Block is fine if "no corrections needed" in corrected_html.lower(): return if len(corrected_html) < len(block_text) * 0.6: block.update_metadata(llm_error_count=1) return block.html = corrected_html class LLMTextSchema(BaseModel): analysis: str corrected_html: str ```
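The guardrails at the end of `process_rewriting` above are easy to miss in the flattened listing: the LLM response must parse against `LLMTextSchema`, must be non-empty, is left alone when the model says no corrections are needed, and is rejected when the rewrite shrinks below 60% of the original block length. A minimal sketch of that validation logic in isolation (the names `validate_rewrite` and `MIN_LENGTH_RATIO` are illustrative, not part of marker):

```python
from pydantic import BaseModel, ValidationError

MIN_LENGTH_RATIO = 0.6  # mirrors the 0.6 length guard in process_rewriting


class LLMTextSchema(BaseModel):
    analysis: str
    corrected_html: str


def validate_rewrite(response: dict | None, original_html: str) -> str | None:
    """Apply the same acceptance checks process_rewriting uses; return the
    corrected HTML on success, or None (the caller logs llm_error_count)."""
    if not response:
        return None
    try:
        parsed = LLMTextSchema(**response)  # malformed responses are rejected
    except ValidationError:
        return None
    corrected = parsed.corrected_html
    if not corrected:
        return None
    if "no corrections needed" in corrected.lower():
        return original_html  # the block was already correct; keep it as-is
    if len(corrected) < len(original_html) * MIN_LENGTH_RATIO:
        return None  # a much shorter rewrite likely dropped content
    return corrected
```

The length-ratio check is a cheap hallucination guard: a rewrite far shorter than its input usually means the model truncated the block rather than corrected it.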
-------------------------------------------------------------------------------- /signatures/version1/cla.json: -------------------------------------------------------------------------------- ```json { "signedContributors": [ { "name": "korakot", "id": 3155646, "comment_id": 2143359366, "created_at": "2024-06-01T08:25:52Z", "repoId": 712111618, "pullRequestNo": 161 }, { "name": "tosaddler", "id": 13705399, "comment_id": 2144014410, "created_at": "2024-06-02T20:40:52Z", "repoId": 712111618, "pullRequestNo": 165 }, { "name": "q2333gh", "id": 32679742, "comment_id": 2156122900, "created_at": "2024-06-08T18:01:39Z", "repoId": 712111618, "pullRequestNo": 176 }, { "name": "q2333gh", "id": 32679742, "comment_id": 2156614334, "created_at": "2024-06-09T13:48:49Z", "repoId": 712111618, "pullRequestNo": 176 }, { "name": "aniketinamdar", "id": 79044809, "comment_id": 2157453610, "created_at": "2024-06-10T06:43:39Z", "repoId": 712111618, "pullRequestNo": 179 }, { "name": "Zxilly", "id": 31370133, "comment_id": 2295163597, "created_at": "2024-08-18T07:44:04Z", "repoId": 712111618, "pullRequestNo": 257 }, { "name": "conscienceli", "id": 4034943, "comment_id": 2333374932, "created_at": "2024-09-06T06:56:25Z", "repoId": 712111618, "pullRequestNo": 276 }, { "name": "jcytong", "id": 156466, "comment_id": 2334987739, "created_at": "2024-09-07T01:27:51Z", "repoId": 712111618, "pullRequestNo": 278 }, { "name": "syldor", "id": 4158062, "comment_id": 2428114896, "created_at": "2024-10-22T03:05:34Z", "repoId": 712111618, "pullRequestNo": 309 }, { "name": "iammosespaulr", "id": 28682735, "comment_id": 2448054789, "created_at": "2024-10-30T18:36:24Z", "repoId": 712111618, "pullRequestNo": 327 }, { "name": "tjbck", "id": 25473318, "comment_id": 2454051305, "created_at": "2024-11-04T08:09:46Z", "repoId": 712111618, "pullRequestNo": 332 }, { "name": "jonaskahn", "id": 4338500, "comment_id": 2516932772, "created_at": "2024-12-04T10:47:00Z", "repoId": 712111618, "pullRequestNo": 413 }, { "name": "Leon-Sander", "id": 72946124, "comment_id": 2519736628, "created_at": "2024-12-05T09:26:56Z", "repoId": 712111618, "pullRequestNo": 415 }, { "name": "aguadoenzo", "id": 11411723, "comment_id": 2520293935, "created_at": "2024-12-05T13:13:34Z", "repoId": 712111618, "pullRequestNo": 416 }, { "name": "tarun-menta", "id": 66506307, "comment_id": 2543907406, "created_at": "2024-12-15T15:06:32Z", "repoId": 712111618, "pullRequestNo": 427 }, { "name": "ZeyuTeng96", "id": 96521059, "comment_id": 2567236036, "created_at": "2025-01-02T02:36:02Z", "repoId": 712111618, "pullRequestNo": 452 }, { "name": "xiaoyao9184", "id": 6614349, "comment_id": 2571623521, "created_at": "2025-01-05T13:15:34Z", "repoId": 712111618, "pullRequestNo": 463 }, { "name": "yasyf", "id": 709645, "comment_id": 2571679069, "created_at": "2025-01-05T16:23:12Z", "repoId": 712111618, "pullRequestNo": 464 }, { "name": "jazzido", "id": 27584, "comment_id": 2610428000, "created_at": "2025-01-23T17:01:02Z", "repoId": 712111618, "pullRequestNo": 502 }, { "name": "tagliala", "id": 556268, "comment_id": 2614522545, "created_at": "2025-01-26T17:44:13Z", "repoId": 712111618, "pullRequestNo": 507 }, { "name": "conjuncts", "id": 67614673, "comment_id": 2646947362, "created_at": "2025-02-10T05:22:00Z", "repoId": 712111618, "pullRequestNo": 533 }, { "name": "sleiss", "id": 5084100, "comment_id": 2661051259, "created_at": "2025-02-15T19:11:32Z", "repoId": 712111618, "pullRequestNo": 552 }, { "name": "dantetemplar", "id": 69670642, "comment_id": 2661665606, "created_at": 
"2025-02-16T23:02:34Z", "repoId": 712111618, "pullRequestNo": 555 }, { "name": "vicenciomf2", "id": 127889973, "comment_id": 2676007412, "created_at": "2025-02-22T04:34:27Z", "repoId": 712111618, "pullRequestNo": 574 }, { "name": "denisshepelin", "id": 143056, "comment_id": 2694298068, "created_at": "2025-03-03T12:50:11Z", "repoId": 712111618, "pullRequestNo": 592 }, { "name": "lecifire", "id": 43193080, "comment_id": 2735460895, "created_at": "2025-03-19T06:25:13Z", "repoId": 712111618, "pullRequestNo": 609 }, { "name": "eddiedozier", "id": 9558671, "comment_id": 2774087436, "created_at": "2025-04-03T01:14:41Z", "repoId": 712111618, "pullRequestNo": 646 }, { "name": "MauritsBrinkman", "id": 153435669, "comment_id": 2838830143, "created_at": "2025-04-29T13:10:38Z", "repoId": 712111618, "pullRequestNo": 675 }, { "name": "jacksontromero", "id": 42822986, "comment_id": 2927811267, "created_at": "2025-06-01T20:10:35Z", "repoId": 712111618, "pullRequestNo": 721 }, { "name": "rgeorgi", "id": 805862, "comment_id": 2968063099, "created_at": "2025-06-12T20:23:50Z", "repoId": 712111618, "pullRequestNo": 751 }, { "name": "zanussbaum", "id": 33707069, "comment_id": 3006162493, "created_at": "2025-06-25T21:12:25Z", "repoId": 712111618, "pullRequestNo": 769 }, { "name": "runarmod", "id": 40912933, "comment_id": 3007582739, "created_at": "2025-06-26T08:06:45Z", "repoId": 712111618, "pullRequestNo": 772 }, { "name": "ArnoKlein", "id": 64474825, "comment_id": 3044904172, "created_at": "2025-07-07T12:41:57Z", "repoId": 712111618, "pullRequestNo": 793 }, { "name": "voberoi", "id": 26339, "comment_id": 3054261379, "created_at": "2025-07-09T22:17:58Z", "repoId": 712111618, "pullRequestNo": 796 }, { "name": "MrWaradana", "id": 77272854, "comment_id": 3060610542, "created_at": "2025-07-11T05:30:30Z", "repoId": 712111618, "pullRequestNo": 799 }, { "name": "schen1102", "id": 221293366, "comment_id": 3083001869, "created_at": "2025-07-17T07:50:39Z", "repoId": 712111618, "pullRequestNo": 810 }, { "name": "igormf", "id": 8070635, "comment_id": 3156380190, "created_at": "2025-08-05T19:40:17Z", "repoId": 712111618, "pullRequestNo": 823 }, { "name": "muratcanlaloglu", "id": 71566447, "comment_id": 3189653464, "created_at": "2025-08-14T19:34:21Z", "repoId": 712111618, "pullRequestNo": 833 }, { "name": "SageStack", "id": 124574940, "comment_id": 3192827940, "created_at": "2025-08-15T21:40:27Z", "repoId": 712111618, "pullRequestNo": 837 }, { "name": "dowithless", "id": 165774507, "comment_id": 3192986184, "created_at": "2025-08-15T23:20:17Z", "repoId": 712111618, "pullRequestNo": 839 }, { "name": "blap", "id": 913963, "comment_id": 3036951999, "created_at": "2025-07-04T17:42:05Z", "repoId": 712111618, "pullRequestNo": 789 }, { "name": "sandy0kwon", "id": 78377296, "comment_id": 3207667429, "created_at": "2025-08-20T18:59:07Z", "repoId": 712111618, "pullRequestNo": 843 }, { "name": "TwinkleXD", "id": 68148987, "comment_id": 3218116522, "created_at": "2025-08-24T13:49:14Z", "repoId": 712111618, "pullRequestNo": 849 }, { "name": "u-ashish", "id": 14264791, "comment_id": 3221349115, "created_at": "2025-08-25T18:41:28Z", "repoId": 712111618, "pullRequestNo": 850 }, { "name": "EdmondChuiHW", "id": 1967998, "comment_id": 3254531992, "created_at": "2025-09-04T16:30:48Z", "repoId": 712111618, "pullRequestNo": 869 }, { "name": "himanshu-josh", "id": 216880840, "comment_id": 3311497351, "created_at": "2025-09-19T09:48:31Z", "repoId": 712111618, "pullRequestNo": 891 }, { "name": "aud", "id": 13060190, "comment_id": 3331451358, 
"created_at": "2025-09-25T02:21:46Z", "repoId": 712111618, "pullRequestNo": 895 }, { "name": "Tenkeboks", "id": 198454519, "comment_id": 3338897497, "created_at": "2025-09-26T14:05:43Z", "repoId": 712111618, "pullRequestNo": 897 }, { "name": "ozzzzz", "id": 5720160, "comment_id": 3363582442, "created_at": "2025-10-02T23:18:12Z", "repoId": 712111618, "pullRequestNo": 905 }, { "name": "Vidarshana26204", "id": 36869394, "comment_id": 3394829315, "created_at": "2025-10-12T16:26:13Z", "repoId": 712111618, "pullRequestNo": 916 } ] } ``` -------------------------------------------------------------------------------- /marker/renderers/markdown.py: -------------------------------------------------------------------------------- ```python import re from collections import defaultdict from typing import Annotated, Tuple import regex import six from bs4 import NavigableString from markdownify import MarkdownConverter, re_whitespace from marker.logger import get_logger from pydantic import BaseModel from marker.renderers.html import HTMLRenderer from marker.schema import BlockTypes from marker.schema.document import Document logger = get_logger() def escape_dollars(text): return text.replace("$", r"\$") def cleanup_text(full_text): full_text = re.sub(r"\n{3,}", "\n\n", full_text) full_text = re.sub(r"(\n\s){3,}", "\n\n", full_text) return full_text.strip() def get_formatted_table_text(element): text = [] for content in element.contents: if content is None: continue if isinstance(content, NavigableString): stripped = content.strip() if stripped: text.append(escape_dollars(stripped)) elif content.name == "br": text.append("<br>") elif content.name == "math": text.append("$" + content.text + "$") else: content_str = escape_dollars(str(content)) text.append(content_str) full_text = "" for i, t in enumerate(text): if t == "<br>": full_text += t elif i > 0 and text[i - 1] != "<br>": full_text += " " + t else: full_text += t return full_text class Markdownify(MarkdownConverter): def __init__( self, paginate_output, page_separator, inline_math_delimiters, block_math_delimiters, html_tables_in_markdown, **kwargs, ): super().__init__(**kwargs) self.paginate_output = paginate_output self.page_separator = page_separator self.inline_math_delimiters = inline_math_delimiters self.block_math_delimiters = block_math_delimiters self.html_tables_in_markdown = html_tables_in_markdown def convert_div(self, el, text, parent_tags): is_page = el.has_attr("class") and el["class"][0] == "page" if self.paginate_output and is_page: page_id = el["data-page-id"] pagination_item = ( "\n\n" + "{" + str(page_id) + "}" + self.page_separator + "\n\n" ) return pagination_item + text else: return text def convert_p(self, el, text, parent_tags): hyphens = r"-—¬" has_continuation = el.has_attr("class") and "has-continuation" in el["class"] if has_continuation: block_type = BlockTypes[el["block-type"]] if block_type in [BlockTypes.TextInlineMath, BlockTypes.Text]: if regex.compile( rf".*[\p{{Ll}}|\d][{hyphens}]\s?$", regex.DOTALL ).match(text): # handle hypenation across pages return regex.split(rf"[{hyphens}]\s?$", text)[0] return f"{text} " if block_type == BlockTypes.ListGroup: return f"{text}" return f"{text}\n\n" if text else "" # default convert_p behavior def convert_math(self, el, text, parent_tags): block = el.has_attr("display") and el["display"] == "block" if block: return ( "\n" + self.block_math_delimiters[0] + text.strip() + self.block_math_delimiters[1] + "\n" ) else: return ( " " + self.inline_math_delimiters[0] + 
text.strip() + self.inline_math_delimiters[1] + " " ) def convert_table(self, el, text, parent_tags): if self.html_tables_in_markdown: return "\n\n" + str(el) + "\n\n" total_rows = len(el.find_all("tr")) colspans = [] rowspan_cols = defaultdict(int) for i, row in enumerate(el.find_all("tr")): row_cols = rowspan_cols[i] for cell in row.find_all(["td", "th"]): colspan = int(cell.get("colspan", 1)) row_cols += colspan for r in range(int(cell.get("rowspan", 1)) - 1): rowspan_cols[i + r] += ( colspan # Add the colspan to the next rows, so they get the correct number of columns ) colspans.append(row_cols) total_cols = max(colspans) if colspans else 0 grid = [[None for _ in range(total_cols)] for _ in range(total_rows)] for row_idx, tr in enumerate(el.find_all("tr")): col_idx = 0 for cell in tr.find_all(["td", "th"]): # Skip filled positions while col_idx < total_cols and grid[row_idx][col_idx] is not None: col_idx += 1 # Fill in grid value = ( get_formatted_table_text(cell) .replace("\n", " ") .replace("|", " ") .strip() ) rowspan = int(cell.get("rowspan", 1)) colspan = int(cell.get("colspan", 1)) if col_idx >= total_cols: # Skip this cell if we're out of bounds continue for r in range(rowspan): for c in range(colspan): try: if r == 0 and c == 0: grid[row_idx][col_idx] = value else: grid[row_idx + r][col_idx + c] = ( "" # Empty cell due to rowspan/colspan ) except IndexError: # Sometimes the colspan/rowspan predictions can overflow logger.info( f"Overflow in columns: {col_idx + c} >= {total_cols} or rows: {row_idx + r} >= {total_rows}" ) continue col_idx += colspan markdown_lines = [] col_widths = [0] * total_cols for row in grid: for col_idx, cell in enumerate(row): if cell is not None: col_widths[col_idx] = max(col_widths[col_idx], len(str(cell))) def add_header_line(): markdown_lines.append( "|" + "|".join("-" * (width + 2) for width in col_widths) + "|" ) # Generate markdown rows added_header = False for i, row in enumerate(grid): is_empty_line = all(not cell for cell in row) if is_empty_line and not added_header: # Skip leading blank lines continue line = [] for col_idx, cell in enumerate(row): if cell is None: cell = "" padding = col_widths[col_idx] - len(str(cell)) line.append(f" {cell}{' ' * padding} ") markdown_lines.append("|" + "|".join(line) + "|") if not added_header: # Skip empty lines when adding the header row add_header_line() added_header = True # Handle one row tables if total_rows == 1: add_header_line() table_md = "\n".join(markdown_lines) return "\n\n" + table_md + "\n\n" def convert_a(self, el, text, parent_tags): text = self.escape(text) # Escape brackets and parentheses in text text = re.sub(r"([\[\]()])", r"\\\1", text) return super().convert_a(el, text, parent_tags) def convert_span(self, el, text, parent_tags): if el.get("id"): return f'<span id="{el["id"]}">{text}</span>' else: return text def escape(self, text, parent_tags=None): text = super().escape(text, parent_tags) if self.options["escape_dollars"]: text = text.replace("$", r"\$") return text def process_text(self, el, parent_tags=None): text = six.text_type(el) or "" # normalize whitespace if we're not inside a preformatted element if not el.find_parent("pre"): text = re_whitespace.sub(" ", text) # escape special characters if we're not inside a preformatted or code element if not el.find_parent(["pre", "code", "kbd", "samp", "math"]): text = self.escape(text) # remove trailing whitespaces if any of the following condition is true: # - current text node is the last node in li # - current text node is followed 
by an embedded list if el.parent.name == "li" and ( not el.next_sibling or el.next_sibling.name in ["ul", "ol"] ): text = text.rstrip() return text class MarkdownOutput(BaseModel): markdown: str images: dict metadata: dict class MarkdownRenderer(HTMLRenderer): page_separator: Annotated[ str, "The separator to use between pages.", "Default is '-' * 48." ] = "-" * 48 inline_math_delimiters: Annotated[ Tuple[str], "The delimiters to use for inline math." ] = ("$", "$") block_math_delimiters: Annotated[ Tuple[str], "The delimiters to use for block math." ] = ("$$", "$$") html_tables_in_markdown: Annotated[ bool, "Return tables formatted as HTML, instead of in markdown" ] = False @property def md_cls(self): return Markdownify( self.paginate_output, self.page_separator, heading_style="ATX", bullets="-", escape_misc=False, escape_underscores=True, escape_asterisks=True, escape_dollars=True, sub_symbol="<sub>", sup_symbol="<sup>", inline_math_delimiters=self.inline_math_delimiters, block_math_delimiters=self.block_math_delimiters, html_tables_in_markdown=self.html_tables_in_markdown ) def __call__(self, document: Document) -> MarkdownOutput: document_output = document.render(self.block_config) full_html, images = self.extract_html(document, document_output) markdown = self.md_cls.convert(full_html) markdown = cleanup_text(markdown) # Ensure we set the correct blanks for pagination markers if self.paginate_output: if not markdown.startswith("\n\n"): markdown = "\n\n" + markdown if markdown.endswith(self.page_separator): markdown += "\n\n" return MarkdownOutput( markdown=markdown, images=images, metadata=self.generate_document_metadata(document, document_output), ) ``` -------------------------------------------------------------------------------- /marker/processors/llm/llm_page_correction.py: -------------------------------------------------------------------------------- ```python import json from concurrent.futures import ThreadPoolExecutor, as_completed from typing import List, Annotated from marker.logger import get_logger from marker.processors.llm import BaseLLMComplexBlockProcessor from marker.schema import BlockTypes from marker.schema.blocks import BlockId from marker.schema.document import Document from marker.schema.groups import PageGroup from pydantic import BaseModel from tqdm import tqdm logger = get_logger() FORMAT_TAGS = ["b", "i", "u", "del", "math", "sub", "sup", "a", "code", "p", "img"] BLOCK_MAP = { "Text": [], "TextInlineMath": [], "Table": ["table", "tbody", "tr", "td", "th"], "ListGroup": ["ul", "li"], "SectionHeader": [], "Form": ["form", "input", "select", "textarea", "table", "tbody", "tr", "td", "th"], "Figure": [], "Picture": [], "Code": ["pre"], "TableOfContents": ["table", "tbody", "tr", "td", "th"], } ALL_TAGS = FORMAT_TAGS + [tag for tags in BLOCK_MAP.values() for tag in tags] class LLMPageCorrectionProcessor(BaseLLMComplexBlockProcessor): block_correction_prompt: Annotated[ str, "The user prompt to guide the block correction process." ] = None default_user_prompt = """Your goal is to reformat the blocks to be as correct as possible, without changing the underlying meaning of the text within the blocks. Mostly focus on reformatting the content. Ignore minor formatting issues like extra <i> tags.""" page_prompt = """You're a text correction expert specializing in accurately reproducing text from PDF pages. You will be given a JSON list of blocks on a PDF page, along with the image for that page. The blocks will be formatted like the example below. 
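The rowspan/colspan grid fill in `convert_table` above is the subtle part of this renderer. The following standalone sketch replicates the same idea on a hypothetical two-row table (the sample HTML is invented for illustration, not taken from the repo), showing how a spanning cell claims positions that later rows must skip:

```python
from bs4 import BeautifulSoup

# Hypothetical table: the first cell spans both rows.
html = "<table><tr><td rowspan='2'>A</td><td>B</td></tr><tr><td>C</td></tr></table>"
rows = BeautifulSoup(html, "html.parser").find_all("tr")

grid = [[None, None], [None, None]]  # 2 rows x 2 cols
for row_idx, tr in enumerate(rows):
    col_idx = 0
    for cell in tr.find_all(["td", "th"]):
        # Skip positions already claimed by an earlier rowspan/colspan.
        while col_idx < 2 and grid[row_idx][col_idx] is not None:
            col_idx += 1
        for r in range(int(cell.get("rowspan", 1))):
            for c in range(int(cell.get("colspan", 1))):
                # The anchor position gets the text; spanned positions get "".
                grid[row_idx + r][col_idx + c] = cell.text if r == c == 0 else ""
        col_idx += int(cell.get("colspan", 1))

print(grid)  # [['A', 'B'], ['', 'C']]
```

The empty-string placeholders are what keep every markdown row the same width when the grid is later joined with pipes.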
--------------------------------------------------------------------------------
/marker/processors/llm/llm_page_correction.py:
--------------------------------------------------------------------------------

```python
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Annotated

from marker.logger import get_logger
from marker.processors.llm import BaseLLMComplexBlockProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import BlockId
from marker.schema.document import Document
from marker.schema.groups import PageGroup
from pydantic import BaseModel
from tqdm import tqdm

logger = get_logger()

FORMAT_TAGS = ["b", "i", "u", "del", "math", "sub", "sup", "a", "code", "p", "img"]
BLOCK_MAP = {
    "Text": [],
    "TextInlineMath": [],
    "Table": ["table", "tbody", "tr", "td", "th"],
    "ListGroup": ["ul", "li"],
    "SectionHeader": [],
    "Form": ["form", "input", "select", "textarea", "table", "tbody", "tr", "td", "th"],
    "Figure": [],
    "Picture": [],
    "Code": ["pre"],
    "TableOfContents": ["table", "tbody", "tr", "td", "th"],
}
ALL_TAGS = FORMAT_TAGS + [tag for tags in BLOCK_MAP.values() for tag in tags]


class LLMPageCorrectionProcessor(BaseLLMComplexBlockProcessor):
    block_correction_prompt: Annotated[
        str, "The user prompt to guide the block correction process."
    ] = None

    default_user_prompt = """Your goal is to reformat the blocks to be as correct as possible, without changing the underlying meaning of the text within the blocks.  Mostly focus on reformatting the content.  Ignore minor formatting issues like extra <i> tags."""

    page_prompt = """You're a text correction expert specializing in accurately reproducing text from PDF pages.
You will be given a JSON list of blocks on a PDF page, along with the image for that page.  The blocks will be formatted like the example below.  The blocks will be presented in reading order.

```json
[
    {
        "bbox": [x1, y1, x2, y2],
        "id": "/page/0/Text/1",
        "block_type": "Text",
        "html": "<p>Some text here</p>",
    },
    ...
]
```

You will also be given a prompt from the user that tells you how to correct the blocks.  Your task is to analyze the blocks and the image, then follow the prompt to correct the blocks.

Here are the types of changes you can make in response to the prompt:

- Reorder the blocks to reflect the correct reading order.
- Change the block type to the correct type - the potential types are "SectionHeader", "Form", "Text", "Table", "Figure", "Picture", "ListGroup", "PageFooter", "PageHeader", "Footnote", or "Equation".  In this case, update the html as well to match the new block type.
- Make edits to block content by changing the HTML.

Guidelines:
- Only use the following tags: {{format_tags}}.  Do not use any other tags.
- The math tag can have the attribute `display="block"` to indicate display math, the a tag can have the attribute `href="..."` to indicate a link, and td and th tags can have the attribute `colspan="..."` and `rowspan="..."` to indicate table cells that span multiple columns or rows.  There can be a "block-type" attribute on p tags.  Do not use any other attributes.
- Keep LaTeX formulas inside <math> tags - these are important for downstream processing.
- Bboxes are normalized 0-1000
- The order of the JSON list is the reading order for the blocks
- Follow the user prompt faithfully, and only make additional changes if there is a significant issue with correctness.
- Stay faithful to the original image, and do not insert any content that is not present in the image or the blocks, unless specifically requested by the user prompt.

**Instructions:**
1. Carefully examine the provided JSON representation of the page, along with the image.
2. Analyze the user prompt.
3. Identify any issues you'll need to fix, and write a short analysis.
4. If everything is fine, output "no_corrections".  Otherwise, output the type of correction needed: ["reorder", "rewrite", "reorder_first"].  Rewrite includes rewriting html and changing the block type.  If you need to do both, then perform only the reordering, and output "reorder_first", so we can do the rewriting later.
5. If corrections are needed, output any blocks that need updates:
    a. If reading order needs to be changed, output the IDs of the blocks in the correct order, and keep block_type and html blank, like this:
```json
[
    {
        "id": "/page/0/Text/1",
        "block_type": "",
        "html": ""
    },
    ...
]
```
    b. If blocks need to be rewritten, output the block ids and new HTML for the blocks, like this:
```json
[
    {
        "id": "/page/0/Text/1",
        "block_type": "Text",
        "html": "<p>New HTML content here</p>"
    },
    ...
]
```

**Example:**

Input:

Blocks
```json
[
    {
        "bbox": [x1, y1, x2, y2],
        "id": "/page/0/Text/1",
        "block_type": "Text",
        "html": "1.14 Vector Operations",
    },
    {
        "bbox": [x1, y1, x2, y2],
        "id": "/page/0/Text/2",
        "block_type": "Text",
        "html": "<p>You can perform many operations on a vector, including...</p>",
    },
]
```

User Prompt

Ensure that all blocks have the correct labels, and that reading order is correct.

Output:

Analysis: The blocks are in the correct reading order, but the first block should actually be a SectionHeader.
```json
[
    {
        "id": "/page/0/Text/1",
        "block_type": "SectionHeader",
        "html": "<h1>1.14 Vector Operations</h1>"
    }
]
```

**Input:**

Blocks
```json
{{page_json}}
```

User Prompt

{{user_prompt}}
"""

    def get_selected_blocks(
        self,
        document: Document,
        page: PageGroup,
    ) -> List[dict]:
        selected_blocks = page.structure_blocks(document)
        json_blocks = [
            self.normalize_block_json(block, document, page)
            for i, block in enumerate(selected_blocks)
        ]
        return json_blocks

    def process_rewriting(self, document: Document, page1: PageGroup):
        page_blocks = self.get_selected_blocks(document, page1)
        image = page1.get_image(document, highres=False)

        prompt = (
            self.page_prompt.replace("{{page_json}}", json.dumps(page_blocks))
            .replace("{{format_tags}}", json.dumps(ALL_TAGS))
            .replace("{{user_prompt}}", self.block_correction_prompt)
        )
        response = self.llm_service(prompt, image, page1, PageSchema)
        logger.debug(f"Got response from LLM: {response}")
        if not response or "correction_type" not in response:
            logger.warning("LLM did not return a valid response")
            return

        correction_type = response["correction_type"]
        if correction_type == "no_corrections":
            return
        elif correction_type in ["reorder", "reorder_first"]:
            self.load_blocks(response)
            self.handle_reorder(response["blocks"], page1)

            # If we needed to reorder first, we will handle the rewriting next
            if correction_type == "reorder_first":
                self.process_rewriting(document, page1)
        elif correction_type == "rewrite":
            self.load_blocks(response)
            self.handle_rewrites(response["blocks"], document)
        else:
            logger.warning(f"Unknown correction type: {correction_type}")
            return

    def load_blocks(self, response):
        if isinstance(response["blocks"], str):
            response["blocks"] = json.loads(response["blocks"])

    def handle_reorder(self, blocks: list, page1: PageGroup):
        unique_page_ids = set()
        document_page_ids = [str(page1.page_id)]
        document_pages = [page1]
        for block_data in blocks:
            try:
                page_id, _, _ = block_data["id"].split("/")
                unique_page_ids.add(page_id)
            except Exception as e:
                logger.debug(f"Error parsing block ID {block_data['id']}: {e}")
                continue

        if set(document_page_ids) != unique_page_ids:
            logger.debug(
                "Some page IDs in the response do not match the document's pages"
            )
            return

        for page_id, document_page in zip(unique_page_ids, document_pages):
            block_ids_for_page = []
            for block_data in blocks:
                try:
                    page_id, block_type, block_id = block_data["id"].split("/")
                    block_id = BlockId(
                        page_id=page_id,
                        block_id=block_id,
                        block_type=getattr(BlockTypes, block_type),
                    )
                    block_ids_for_page.append(block_id)
                except Exception as e:
                    logger.debug(f"Error parsing block ID {block_data['id']}: {e}")
                    continue

            # Both sides should have the same values, just be reordered
            if not all(
                [
                    block_id in document_page.structure
                    for block_id in block_ids_for_page
                ]
            ):
                logger.debug(f"Some blocks for page {page_id} not found in document")
                continue

            if not all(
                [
                    block_id in block_ids_for_page
                    for block_id in document_page.structure
                ]
            ):
                logger.debug(
                    f"Some blocks in document page {page_id} not found in response"
                )
                continue

            # Swap the order of blocks in the document page
            document_page.structure = block_ids_for_page

    def handle_rewrites(self, blocks: list, document: Document):
        for block_data in blocks:
            try:
                block_id = block_data["id"].strip().lstrip("/")
                _, page_id, block_type, block_id = block_id.split("/")
                block_id = BlockId(
                    page_id=page_id,
                    block_id=block_id,
                    block_type=getattr(BlockTypes, block_type),
                )
                block = document.get_block(block_id)
                if not block:
                    logger.debug(f"Block {block_id} not found in document")
                    continue

                if hasattr(block, "html"):
                    block.html = block_data["html"]
            except Exception as e:
                logger.debug(f"Error parsing block ID {block_data['id']}: {e}")
                continue

    def rewrite_blocks(self, document: Document):
        if not self.block_correction_prompt:
            return

        # Don't show progress if there are no blocks to process
        total_blocks = len(document.pages)
        if total_blocks == 0:
            return

        pbar = tqdm(
            total=max(1, total_blocks - 1),
            desc=f"{self.__class__.__name__} running",
            disable=self.disable_tqdm,
        )
        with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
            for future in as_completed(
                [
                    executor.submit(self.process_rewriting, document, page)
                    for page in document.pages
                ]
            ):
                future.result()  # Raise exceptions if any occurred
                pbar.update(1)

        pbar.close()


class BlockSchema(BaseModel):
    id: str
    html: str
    block_type: str


class PageSchema(BaseModel):
    analysis: str
    correction_type: str
    blocks: List[BlockSchema]
```
--------------------------------------------------------------------------------
/marker/schema/blocks/base.py:
--------------------------------------------------------------------------------

```python
from __future__ import annotations

from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Sequence, Tuple

from pydantic import BaseModel, ConfigDict, field_validator
from PIL import Image

from marker.schema import BlockTypes
from marker.schema.polygon import PolygonBox

if TYPE_CHECKING:
    from marker.schema.document import Document
    from marker.schema.groups.page import PageGroup


class BlockMetadata(BaseModel):
    llm_request_count: int = 0
    llm_error_count: int = 0
    llm_tokens_used: int = 0
    previous_text: str = ""
    previous_type: str = ""
    previous_order: int = 0

    def merge(self, model2):
        return self.__class__(
            **{
                field: getattr(self, field) + getattr(model2, field)
                for field in self.model_fields
            }
        )


class BlockOutput(BaseModel):
    html: str
    polygon: PolygonBox
    id: BlockId
    children: List[BlockOutput] | None = None
    section_hierarchy: Dict[int, BlockId] | None = None


class BlockId(BaseModel):
    page_id: int
    block_id: Optional[int] = None
    block_type: BlockTypes | None = None

    def __str__(self):
        if self.block_type is None or self.block_id is None:
            return f"/page/{self.page_id}"
        return f"/page/{self.page_id}/{self.block_type.name}/{self.block_id}"

    def __hash__(self):
        return hash(str(self))

    def __repr__(self):
        return str(self)

    def __eq__(self, other):
        if not isinstance(other, (BlockId, str)):
            return NotImplemented

        if isinstance(other, str):
            return str(self) == other
        else:
            return (
                self.page_id == other.page_id
                and self.block_id == other.block_id
                and self.block_type == other.block_type
            )

    @field_validator("block_type")
    @classmethod
    def validate_block_type(cls, v):
        from marker.schema import BlockTypes

        if v not in BlockTypes:
            raise ValueError(f"Invalid block type: {v}")
        return v

    def to_path(self):
        return str(self).replace("/", "_")


class Block(BaseModel):
    polygon: PolygonBox
    block_description: str
    block_type: Optional[BlockTypes] = None
    block_id: Optional[int] = None
    page_id: Optional[int] = None
    text_extraction_method: Optional[Literal["pdftext", "surya", "gemini"]] = None
    structure: List[BlockId] | None = (
        None  # The top-level page structure, which is the block ids in order
    )
    ignore_for_output: bool = False  # Whether this block should be ignored in output
    replace_output_newlines: bool = (
        False  # Whether to replace newlines with spaces in output
    )
    source: Literal["layout", "heuristics", "processor"] = "layout"
    top_k: Optional[Dict[BlockTypes, float]] = None
    metadata: BlockMetadata | None = None
    lowres_image: Image.Image | None = None
    highres_image: Image.Image | None = None
    removed: bool = False  # Has block been replaced by new block?
    _metadata: Optional[dict] = None

    model_config = ConfigDict(arbitrary_types_allowed=True)

    @property
    def id(self) -> BlockId:
        return BlockId(
            page_id=self.page_id, block_id=self.block_id, block_type=self.block_type
        )

    @classmethod
    def from_block(cls, block: Block) -> Block:
        block_attrs = block.model_dump(exclude=["id", "block_id", "block_type"])
        return cls(**block_attrs)

    def set_internal_metadata(self, key, data):
        if self._metadata is None:
            self._metadata = {}
        self._metadata[key] = data

    def get_internal_metadata(self, key):
        if self._metadata is None:
            return None
        return self._metadata.get(key)

    def get_image(
        self,
        document: Document,
        highres: bool = False,
        expansion: Tuple[float, float] | None = None,
        remove_blocks: Sequence[BlockTypes] | None = None,
    ) -> Image.Image | None:
        image = self.highres_image if highres else self.lowres_image
        if image is None:
            page = document.get_page(self.page_id)
            page_image = page.get_image(highres=highres, remove_blocks=remove_blocks)

            # Scale to the image size
            bbox = self.polygon.rescale(
                (page.polygon.width, page.polygon.height), page_image.size
            )
            if expansion:
                bbox = bbox.expand(*expansion)
            bbox = bbox.bbox
            image = page_image.crop(bbox)
        return image

    def structure_blocks(self, document_page: Document | PageGroup) -> List[Block]:
        if self.structure is None:
            return []
        return [document_page.get_block(block_id) for block_id in self.structure]

    def get_prev_block(
        self,
        document_page: Document | PageGroup,
        block: Block,
        ignored_block_types: Optional[List[BlockTypes]] = None,
    ):
        if ignored_block_types is None:
            ignored_block_types = []

        structure_idx = self.structure.index(block.id)
        if structure_idx == 0:
            return None

        for prev_block_id in reversed(self.structure[:structure_idx]):
            if prev_block_id.block_type not in ignored_block_types:
                return document_page.get_block(prev_block_id)

    def get_next_block(
        self,
        document_page: Document | PageGroup,
        block: Optional[Block] = None,
        ignored_block_types: Optional[List[BlockTypes]] = None,
    ):
        if ignored_block_types is None:
            ignored_block_types = []

        structure_idx = 0
        if block is not None:
            structure_idx = self.structure.index(block.id) + 1

        for next_block_id in self.structure[structure_idx:]:
            if next_block_id.block_type not in ignored_block_types:
                return document_page.get_block(next_block_id)

        return None  # No valid next block found

    def add_structure(self, block: Block):
        if self.structure is None:
            self.structure = [block.id]
        else:
            self.structure.append(block.id)

    def update_structure_item(self, old_id: BlockId, new_id: BlockId):
        if self.structure is not None:
            for i, item in enumerate(self.structure):
                if item == old_id:
                    self.structure[i] = new_id
                    break

    def remove_structure_items(self, block_ids: List[BlockId]):
        if self.structure is not None:
            self.structure = [item for item in self.structure if item not in block_ids]

    def raw_text(self, document: Document) -> str:
        from marker.schema.text.line import Line
        from marker.schema.text.span import Span
        from marker.schema.blocks.tablecell import TableCell

        if self.structure is None:
            if isinstance(self, (Span, TableCell)):
                return self.text
            else:
                return ""

        text = ""
        for block_id in self.structure:
            block = document.get_block(block_id)
            text += block.raw_text(document)
            if isinstance(block, Line) and not text.endswith("\n"):
                text += "\n"
        return text

    def assemble_html(
        self,
        document: Document,
        child_blocks: List[BlockOutput],
        parent_structure: Optional[List[str]] = None,
        block_config: Optional[dict] = None,
    ) -> str:
        if self.ignore_for_output:
            return ""

        template = ""
        for c in child_blocks:
            template += f"<content-ref src='{c.id}'></content-ref>"

        if self.replace_output_newlines:
            template = template.replace("\n", " ")
            template = "<p>" + template + "</p>"

        return template

    def assign_section_hierarchy(self, section_hierarchy):
        if self.block_type == BlockTypes.SectionHeader and self.heading_level:
            levels = list(section_hierarchy.keys())
            for level in levels:
                if level >= self.heading_level:
                    del section_hierarchy[level]
            section_hierarchy[self.heading_level] = self.id

        return section_hierarchy

    def contained_blocks(
        self, document: Document, block_types: Sequence[BlockTypes] = None
    ) -> List[Block]:
        if self.structure is None:
            return []

        blocks = []
        for block_id in self.structure:
            block = document.get_block(block_id)
            if block.removed:
                continue
            if (
                block_types is None or block.block_type in block_types
            ) and not block.removed:
                blocks.append(block)
            blocks += block.contained_blocks(document, block_types)
        return blocks

    def replace_block(self, block: Block, new_block: Block):
        if self.structure is not None:
            for i, item in enumerate(self.structure):
                if item == block.id:
                    self.structure[i] = new_block.id
                    break

    def render(
        self,
        document: Document,
        parent_structure: Optional[List[str]] = None,
        section_hierarchy: dict | None = None,
        block_config: Optional[dict] = None,
    ) -> BlockOutput:
        if block_config is None:
            block_config = {}

        child_content = []
        if section_hierarchy is None:
            section_hierarchy = {}
        section_hierarchy = self.assign_section_hierarchy(section_hierarchy)

        if self.structure is not None and len(self.structure) > 0:
            for block_id in self.structure:
                block = document.get_block(block_id)
                rendered = block.render(
                    document, self.structure, section_hierarchy, block_config
                )
                section_hierarchy = (
                    rendered.section_hierarchy.copy()
                )  # Update the section hierarchy from the peer blocks
                child_content.append(rendered)

        return BlockOutput(
            html=self.assemble_html(
                document, child_content, parent_structure, block_config
            ),
            polygon=self.polygon,
            id=self.id,
            children=child_content,
            section_hierarchy=section_hierarchy,
        )

    def line_height(self, document: Document) -> float:
        lines = self.contained_blocks(document, (BlockTypes.Line,))
        if len(lines) == 0:
            return 0
        return self.polygon.height / len(lines)

    def update_metadata(self, **kwargs):
        if self.metadata is None:
            self.metadata = BlockMetadata()

        for key, value in kwargs.items():
            metadata_attr = getattr(self.metadata, key)
            if isinstance(metadata_attr, int) and isinstance(value, int):
                setattr(self.metadata, key, metadata_attr + value)
            elif isinstance(metadata_attr, str) and isinstance(value, str):
                setattr(self.metadata, key, value)
            else:
                raise ValueError(f"Metadata attribute {key} is not an integer")

    def handle_html_output(
        self, document, child_blocks, parent_structure, block_config=None
    ):
        if block_config is None:
            block_config = {}

        child_ref_blocks = [
            block
            for block in child_blocks
            if block.id.block_type == BlockTypes.Reference
        ]
        html = Block.assemble_html(
            self, document, child_ref_blocks, parent_structure, block_config
        )
        return html + self.html
```
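The `/page/...` convention implemented by `BlockId.__str__` is worth seeing in isolation. A pared-down stand-in (a plain dataclass rather than the real pydantic model, kept only to show the string form) behaves like this:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class MiniBlockId:
    page_id: int
    block_type: Optional[str] = None
    block_id: Optional[int] = None

    def __str__(self) -> str:
        # A bare page id renders as /page/{page_id}; full ids add type and index.
        if self.block_type is None or self.block_id is None:
            return f"/page/{self.page_id}"
        return f"/page/{self.page_id}/{self.block_type}/{self.block_id}"


assert str(MiniBlockId(0)) == "/page/0"
assert str(MiniBlockId(0, "Text", 1)) == "/page/0/Text/1"
```

This is the same path form the LLM page-correction processor parses back apart, which is why `BlockId.__eq__` also accepts plain strings.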
--------------------------------------------------------------------------------
/marker/processors/llm/llm_table.py:
--------------------------------------------------------------------------------

```python
from typing import Annotated, List, Tuple

from bs4 import BeautifulSoup
from PIL import Image

from marker.logger import get_logger
from pydantic import BaseModel

from marker.processors.llm import BaseLLMComplexBlockProcessor
from marker.schema import BlockTypes
from marker.schema.blocks import Block, TableCell, Table
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup
from marker.schema.polygon import PolygonBox

logger = get_logger()


class LLMTableProcessor(BaseLLMComplexBlockProcessor):
    block_types: Annotated[
        Tuple[BlockTypes],
        "The block types to process.",
    ] = (BlockTypes.Table, BlockTypes.TableOfContents)
    max_rows_per_batch: Annotated[
        int,
        "If the table has more rows than this, chunk the table. (LLMs can be inaccurate with a lot of rows)",
    ] = 60
    max_table_rows: Annotated[
        int,
        "The maximum number of rows in a table to process with the LLM processor. Beyond this will be skipped.",
    ] = 175
    table_image_expansion_ratio: Annotated[
        float,
        "The ratio to expand the image by when cropping.",
    ] = 0
    rotation_max_wh_ratio: Annotated[
        float,
        "The maximum width/height ratio for table cells for a table to be considered rotated.",
    ] = 0.6
    max_table_iterations: Annotated[
        int,
        "The maximum number of iterations to attempt rewriting a table.",
    ] = 2
    table_rewriting_prompt: Annotated[
        str,
        "The prompt to use for rewriting text.",
        "Default is a string containing the Gemini rewriting prompt.",
    ] = """You are a text correction expert specializing in accurately reproducing text from images.
You will receive an image and an html representation of the table in the image.
Your task is to correct any errors in the html representation.  The html representation should be as faithful to the original table image as possible.  The table image may be rotated, but ensure the html representation is not rotated.  Make sure to include HTML for the full table, including the opening and closing table tags.

Some guidelines:
- Reproduce the original values from the image as faithfully as possible.
- There may be stray characters in the html representation that don't match the image - fix these.
- Ensure column headers match the correct column values.
- If you see any inline math in a table cell, fence it with the <math> tag.  Block math should be fenced with <math display="block">.
- Replace any images in table cells with a description, like "Image: [description]".
- Only use the tags th, td, tr, br, span, sup, sub, i, b, math, and table.  Only use the attributes display, style, colspan, and rowspan if necessary.  You can use br to break up text lines in cells.
- Make sure the columns and rows match the image faithfully, and are easily readable and interpretable by a human.

**Instructions:**
1. Carefully examine the provided text block image.
2. Analyze the html representation of the table.
3. Write a comparison of the image and the html representation, paying special attention to the column headers matching the correct column values.
4. If the html representation is completely correct, or you cannot read the image properly, then write "No corrections needed."  If the html representation has errors, generate the corrected html representation.  Output only either the corrected html representation or "No corrections needed."
5. If you made corrections, analyze your corrections against the original image, and provide a score from 1-5, indicating how well the corrected html matches the image, with 5 being perfect.

**Example:**
Input:
```html
<table>
    <tr>
        <th>First Name</th>
        <th>Last Name</th>
        <th>Age</th>
    </tr>
    <tr>
        <td>John</td>
        <td>Doe</td>
    </tr>
</table>
```
Output:
comparison: The image shows a table with 2 rows and 3 columns.  The text and formatting of the html table matches the image.  The column headers match the correct column values.
```html
No corrections needed.
```
analysis: I did not make any corrections, as the html representation was already accurate.
score: 5

**Input:**
```html
{block_html}
```
"""

    def handle_image_rotation(self, children: List[TableCell], image: Image.Image):
        ratios = [c.polygon.width / c.polygon.height for c in children]
        if len(ratios) < 2:
            return image

        is_rotated = all([r < self.rotation_max_wh_ratio for r in ratios])
        if not is_rotated:
            return image

        first_col_id = min([c.col_id for c in children])
        first_col = [c for c in children if c.col_id == first_col_id]
        first_col_cell = first_col[0]

        last_col_id = max([c.col_id for c in children])
        if last_col_id == first_col_id:
            return image
        last_col_cell = [c for c in children if c.col_id == last_col_id][0]

        cell_diff = first_col_cell.polygon.y_start - last_col_cell.polygon.y_start
        if cell_diff == 0:
            return image

        if cell_diff > 0:
            return image.rotate(270, expand=True)
        else:
            return image.rotate(90, expand=True)

    def process_rewriting(self, document: Document, page: PageGroup, block: Table):
        children: List[TableCell] = block.contained_blocks(
            document, (BlockTypes.TableCell,)
        )
        if not children:
            # Happens if table/form processors didn't run
            return

        # LLMs don't handle tables with a lot of rows very well
        unique_rows = set([cell.row_id for cell in children])
        row_count = len(unique_rows)
        row_idxs = sorted(list(unique_rows))

        if row_count > self.max_table_rows:
            return

        # Inference by chunk to handle long tables better
        parsed_cells = []
        row_shift = 0
        block_image = self.extract_image(document, block)
        block_rescaled_bbox = block.polygon.rescale(
            page.polygon.size, page.get_image(highres=True).size
        ).bbox
        for i in range(0, row_count, self.max_rows_per_batch):
            batch_row_idxs = row_idxs[i : i + self.max_rows_per_batch]
            batch_cells = [cell for cell in children if cell.row_id in batch_row_idxs]
            batch_cell_bboxes = [
                cell.polygon.rescale(
                    page.polygon.size, page.get_image(highres=True).size
                ).bbox
                for cell in batch_cells
            ]

            # bbox relative to the block
            batch_bbox = [
                min([bbox[0] for bbox in batch_cell_bboxes]) - block_rescaled_bbox[0],
                min([bbox[1] for bbox in batch_cell_bboxes]) - block_rescaled_bbox[1],
                max([bbox[2] for bbox in batch_cell_bboxes]) - block_rescaled_bbox[0],
                max([bbox[3] for bbox in batch_cell_bboxes]) - block_rescaled_bbox[1],
            ]
            if i == 0:
                # Ensure first image starts from the beginning
                batch_bbox[0] = 0
                batch_bbox[1] = 0
            elif i > row_count - self.max_rows_per_batch + 1:
                # Ensure final image grabs the entire height and width
                batch_bbox[2] = block_image.size[0]
                batch_bbox[3] = block_image.size[1]
            batch_image = block_image.crop(batch_bbox)

            block_html = block.format_cells(document, [], None, batch_cells)
            batch_image = self.handle_image_rotation(batch_cells, batch_image)
            batch_parsed_cells = self.rewrite_single_chunk(
                page, block, block_html, batch_cells, batch_image
            )
            if batch_parsed_cells is None:
                return  # Error occurred or no corrections needed

            for cell in batch_parsed_cells:
                cell.row_id += row_shift
                parsed_cells.append(cell)
            row_shift += max([cell.row_id for cell in batch_parsed_cells])

        block.structure = []
        for cell in parsed_cells:
            page.add_full_block(cell)
            block.add_structure(cell)

    def rewrite_single_chunk(
        self,
        page: PageGroup,
        block: Block,
        block_html: str,
        children: List[TableCell],
        image: Image.Image,
        total_iterations: int = 0,
    ):
        prompt = self.table_rewriting_prompt.replace("{block_html}", block_html)

        response = self.llm_service(prompt, image, block, TableSchema)

        if not response or "corrected_html" not in response:
            block.update_metadata(llm_error_count=1)
            return

        corrected_html = response["corrected_html"]

        # The original table is okay
        if "no corrections needed" in corrected_html.lower():
            return

        corrected_html = corrected_html.strip().lstrip("```html").rstrip("```").strip()

        # Re-iterate if low score
        total_iterations += 1
        score = response.get("score", 5)
        analysis = response.get("analysis", "")
        logger.debug(f"Got table rewriting score {score} with analysis: {analysis}")
        if total_iterations < self.max_table_iterations and score < 4:
            logger.info(
                f"Table rewriting low score {score}, on iteration {total_iterations}"
            )
            block_html = corrected_html
            return self.rewrite_single_chunk(
                page, block, block_html, children, image, total_iterations
            )

        parsed_cells = self.parse_html_table(corrected_html, block, page)
        if len(parsed_cells) <= 1:
            block.update_metadata(llm_error_count=1)
            logger.debug(f"Table parsing issue, only {len(parsed_cells)} cells found")
            return

        if not corrected_html.endswith("</table>"):
            logger.debug(
                "Table parsing issue, corrected html does not end with </table>"
            )
            block.update_metadata(llm_error_count=1)
            return

        return parsed_cells

    @staticmethod
    def get_cell_text(element, keep_tags=("br", "i", "b", "span", "math")) -> str:
        for tag in element.find_all(True):
            if tag.name not in keep_tags:
                tag.unwrap()
        return element.decode_contents()

    def parse_html_table(
        self, html_text: str, block: Block, page: PageGroup
    ) -> List[TableCell]:
        soup = BeautifulSoup(html_text, "html.parser")
        table = soup.find("table")
        if not table:
            return []

        # Initialize grid
        rows = table.find_all("tr")
        cells = []

        # Find maximum number of columns in colspan-aware way
        max_cols = 0
        for row in rows:
            row_tds = row.find_all(["td", "th"])
            curr_cols = 0
            for cell in row_tds:
                colspan = int(cell.get("colspan", 1))
                curr_cols += colspan
            if curr_cols > max_cols:
                max_cols = curr_cols

        grid = [[True] * max_cols for _ in range(len(rows))]

        for i, row in enumerate(rows):
            cur_col = 0
            row_cells = row.find_all(["td", "th"])
            for j, cell in enumerate(row_cells):
                while cur_col < max_cols and not grid[i][cur_col]:
                    cur_col += 1

                if cur_col >= max_cols:
                    logger.info("Table parsing warning: too many columns found")
                    break

                cell_text = self.get_cell_text(cell).strip()
                rowspan = min(int(cell.get("rowspan", 1)), len(rows) - i)
                colspan = min(int(cell.get("colspan", 1)), max_cols - cur_col)
                cell_rows = list(range(i, i + rowspan))
                cell_cols = list(range(cur_col, cur_col + colspan))

                if colspan == 0 or rowspan == 0:
                    logger.info("Table parsing issue: invalid colspan or rowspan")
                    continue

                for r in cell_rows:
                    for c in cell_cols:
                        grid[r][c] = False

                cell_bbox = [
                    block.polygon.bbox[0] + cur_col,
                    block.polygon.bbox[1] + i,
                    block.polygon.bbox[0] + cur_col + colspan,
                    block.polygon.bbox[1] + i + rowspan,
                ]
                cell_polygon = PolygonBox.from_bbox(cell_bbox)

                cell_obj = TableCell(
                    text_lines=[cell_text],
                    row_id=i,
                    col_id=cur_col,
                    rowspan=rowspan,
                    colspan=colspan,
                    is_header=cell.name == "th",
                    polygon=cell_polygon,
                    page_id=page.page_id,
                )
                cells.append(cell_obj)
                cur_col += colspan

        return cells


class TableSchema(BaseModel):
    comparison: str
    corrected_html: str
    analysis: str
    score: int
```
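The row-chunking in `process_rewriting` is plain stride slicing over the sorted row ids. In isolation, with a hypothetical batch size of 3 instead of the default 60 (values invented for illustration):

```python
max_rows_per_batch = 3  # the real default above is 60
row_idxs = list(range(8))  # hypothetical table with 8 rows

for i in range(0, len(row_idxs), max_rows_per_batch):
    batch = row_idxs[i : i + max_rows_per_batch]
    print(batch)
# [0, 1, 2]
# [3, 4, 5]
# [6, 7]
```

Each batch is rewritten independently, and `row_shift` re-offsets the row ids the LLM returns so the chunks stitch back into one table.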
--------------------------------------------------------------------------------
/marker/schema/groups/page.py:
--------------------------------------------------------------------------------

```python
from collections import defaultdict
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
from PIL import Image, ImageDraw
from pdftext.schema import Reference
from pydantic import computed_field

from marker.providers import ProviderOutput
from marker.schema import BlockTypes
from marker.schema.blocks import Block, BlockId, Text
from marker.schema.blocks.base import BlockMetadata
from marker.schema.groups.base import Group
from marker.schema.polygon import PolygonBox
from marker.util import matrix_intersection_area, sort_text_lines

LINE_MAPPING_TYPE = List[Tuple[int, ProviderOutput]]


class PageGroup(Group):
    block_type: BlockTypes = BlockTypes.Page
    # This is bytes if it is serialized
    lowres_image: Image.Image | None | bytes = None
    highres_image: Image.Image | None | bytes = None
    children: List[Union[Any, Block]] | None = None
    layout_sliced: bool = (
        False  # Whether the layout model had to slice the image (order may be wrong)
    )
    excluded_block_types: Sequence[BlockTypes] = (
        BlockTypes.Line,
        BlockTypes.Span,
    )
    maximum_assignment_distance: float = 20  # pixels
    block_description: str = "A single page in the document."
    refs: List[Reference] | None = None
    ocr_errors_detected: bool = False

    def incr_block_id(self):
        if self.block_id is None:
            self.block_id = 0
        else:
            self.block_id += 1

    def add_child(self, block: Block):
        if self.children is None:
            self.children = [block]
        else:
            self.children.append(block)

    def get_image(
        self,
        *args,
        highres: bool = False,
        remove_blocks: Sequence[BlockTypes] | None = None,
        **kwargs,
    ):
        image = self.highres_image if highres else self.lowres_image

        # Check if RGB, convert if needed
        if isinstance(image, Image.Image) and image.mode != "RGB":
            image = image.convert("RGB")

        # Avoid double OCR for certain elements
        if remove_blocks:
            image = image.copy()
            draw = ImageDraw.Draw(image)
            bad_blocks = [
                block
                for block in self.current_children
                if block.block_type in remove_blocks
            ]
            for bad_block in bad_blocks:
                poly = bad_block.polygon.rescale(self.polygon.size, image.size).polygon
                poly = [(int(p[0]), int(p[1])) for p in poly]
                draw.polygon(poly, fill="white")

        return image

    @computed_field
    @property
    def current_children(self) -> List[Block]:
        return [child for child in self.children if not child.removed]

    def get_next_block(
        self,
        block: Optional[Block] = None,
        ignored_block_types: Optional[List[BlockTypes]] = None,
    ):
        if ignored_block_types is None:
            ignored_block_types = []

        structure_idx = 0
        if block is not None:
            structure_idx = self.structure.index(block.id) + 1

        # Iterate over blocks following the given block
        for next_block_id in self.structure[structure_idx:]:
            if next_block_id.block_type not in ignored_block_types:
                return self.get_block(next_block_id)

        return None  # No valid next block found

    def get_prev_block(self, block: Block):
        block_idx = self.structure.index(block.id)
        if block_idx > 0:
            return self.get_block(self.structure[block_idx - 1])
        return None

    def add_block(self, block_cls: type[Block], polygon: PolygonBox) -> Block:
        self.incr_block_id()
        block = block_cls(
            polygon=polygon,
            block_id=self.block_id,
            page_id=self.page_id,
        )
        self.add_child(block)
        return block

    def add_full_block(self, block: Block) -> Block:
        self.incr_block_id()
        block.block_id = self.block_id
        self.add_child(block)
        return block

    def get_block(self, block_id: BlockId) -> Block | None:
        block: Block = self.children[block_id.block_id]
        assert block.block_id == block_id.block_id
        return block

    def assemble_html(
        self, document, child_blocks, parent_structure=None, block_config=None
    ):
        template = ""
        for c in child_blocks:
            template += f"<content-ref src='{c.id}'></content-ref>"
        return template

    def compute_line_block_intersections(
        self, blocks: List[Block], provider_outputs: List[ProviderOutput]
    ):
        max_intersections = {}

        block_bboxes = [block.polygon.bbox for block in blocks]
        line_bboxes = [
            provider_output.line.polygon.bbox for provider_output in provider_outputs
        ]

        intersection_matrix = matrix_intersection_area(line_bboxes, block_bboxes)

        for line_idx, line in enumerate(provider_outputs):
            intersection_line = intersection_matrix[line_idx]
            if intersection_line.sum() == 0:
                continue

            max_intersection = intersection_line.argmax()
            max_intersections[line_idx] = (
                intersection_matrix[line_idx, max_intersection],
                blocks[max_intersection].id,
            )
        return max_intersections

    def compute_max_structure_block_intersection_pct(self):
        structure_blocks = [self.get_block(block_id) for block_id in self.structure]
        structure_block_bboxes = [b.polygon.bbox for b in structure_blocks]
        intersection_matrix = matrix_intersection_area(
            structure_block_bboxes, structure_block_bboxes
        )
        np.fill_diagonal(intersection_matrix, 0)  # Ignore self-intersections

        max_intersection_pct = 0
        for block_idx, block in enumerate(structure_blocks):
            if block.polygon.area == 0:
                continue
            max_intersection_pct = max(
                max_intersection_pct,
                np.max(intersection_matrix[block_idx]) / block.polygon.area,
            )
        return max_intersection_pct

    def replace_block(self, block: Block, new_block: Block):
        # Handles incrementing the id
        self.add_full_block(new_block)

        # Replace block id in structure
        super().replace_block(block, new_block)

        # Replace block in structure of children
        for child in self.children:
            child.replace_block(block, new_block)

        # Mark block as removed
        block.removed = True

    def identify_missing_blocks(
        self,
        provider_line_idxs: List[int],
        provider_outputs: List[ProviderOutput],
        assigned_line_idxs: set[int],
    ):
        new_blocks = []
        new_block = None
        for line_idx in provider_line_idxs:
            if line_idx in assigned_line_idxs:
                continue

            # if the unassociated line is a new line with minimal area, we can skip it
            if (
                provider_outputs[line_idx].line.polygon.area <= 1
                and provider_outputs[line_idx].raw_text == "\n"
            ):
                continue

            if new_block is None:
                new_block = [(line_idx, provider_outputs[line_idx])]
            elif all(
                [
                    new_block[-1][0] + 1 == line_idx,
                    provider_outputs[line_idx].line.polygon.center_distance(
                        new_block[-1][1].line.polygon
                    )
                    < self.maximum_assignment_distance,
                ]
            ):
                new_block.append((line_idx, provider_outputs[line_idx]))
            else:
                new_blocks.append(new_block)
                new_block = [(line_idx, provider_outputs[line_idx])]
            assigned_line_idxs.add(line_idx)

        if new_block:
            new_blocks.append(new_block)

        return new_blocks

    def create_missing_blocks(
        self,
        new_blocks: List[LINE_MAPPING_TYPE],
        block_lines: Dict[BlockId, LINE_MAPPING_TYPE],
    ):
        for new_block in new_blocks:
            block = self.add_block(Text, new_block[0][1].line.polygon)
            block.source = "heuristics"
            block_lines[block.id] = new_block

            min_dist_idx = None
            min_dist = None
            for existing_block_id in self.structure:
                existing_block = self.get_block(existing_block_id)
                if existing_block.block_type in self.excluded_block_types:
                    continue

                # We want to assign to blocks closer in y than x
                dist = block.polygon.center_distance(
                    existing_block.polygon, x_weight=5, absolute=True
                )
                if dist > 0 and min_dist_idx is None or dist < min_dist:
                    min_dist = dist
                    min_dist_idx = existing_block.id

            if min_dist_idx is not None:
                existing_idx = self.structure.index(min_dist_idx)
                self.structure.insert(existing_idx + 1, block.id)
            else:
                self.structure.append(block.id)

    def add_initial_blocks(
        self,
        block_lines: Dict[BlockId, LINE_MAPPING_TYPE],
        text_extraction_method: str,
        keep_chars: bool = False,
    ):
        # Add lines to the proper blocks, sorted in order
        for block_id, lines in block_lines.items():
            line_extraction_methods = set(
                [line[1].line.text_extraction_method for line in lines]
            )
            if len(line_extraction_methods) == 1:
                lines = sorted(lines, key=lambda x: x[0])
                lines = [line for _, line in lines]
            else:
                lines = [line for _, line in lines]
                line_polygons = [line.line.polygon for line in lines]
                sorted_line_polygons = sort_text_lines(line_polygons)
                argsort = [line_polygons.index(p) for p in sorted_line_polygons]
                lines = [lines[i] for i in argsort]

            block = self.get_block(block_id)
            for provider_output in lines:
                line = provider_output.line
                spans = provider_output.spans
                self.add_full_block(line)
                block.add_structure(line)
                block.polygon = block.polygon.merge([line.polygon])
                block.text_extraction_method = text_extraction_method
                for span_idx, span in enumerate(spans):
                    self.add_full_block(span)
                    line.add_structure(span)

                    if not keep_chars:
                        continue

                    # Provider doesn't have chars
                    if len(provider_output.chars) == 0:
                        continue

                    # Loop through characters associated with the span
                    for char in provider_output.chars[span_idx]:
                        char.page_id = self.page_id
                        self.add_full_block(char)
                        span.add_structure(char)

    def merge_blocks(
        self,
        provider_outputs: List[ProviderOutput],
        text_extraction_method: str,
        keep_chars: bool = False,
    ):
        provider_line_idxs = list(range(len(provider_outputs)))
        valid_blocks = [
            block
            for block in self.current_children  # ensure we only look at children that haven't been replaced
            if block.block_type not in self.excluded_block_types
        ]

        max_intersections = self.compute_line_block_intersections(
            valid_blocks, provider_outputs
        )

        # Try to assign lines by intersection
        assigned_line_idxs = set()
        block_lines = defaultdict(list)
        for line_idx, provider_output in enumerate(provider_outputs):
            if line_idx in max_intersections:
                block_id = max_intersections[line_idx][1]
                block_lines[block_id].append((line_idx, provider_output))
                assigned_line_idxs.add(line_idx)

        # If no intersection, assign by distance
        for line_idx in set(provider_line_idxs).difference(assigned_line_idxs):
            min_dist = None
            min_dist_idx = None
            provider_output: ProviderOutput = provider_outputs[line_idx]
            line = provider_output.line
            for block in valid_blocks:
                # We want to assign to blocks closer in y than x
                dist = line.polygon.center_distance(block.polygon, x_weight=5)
                if min_dist_idx is None or dist < min_dist:
                    min_dist = dist
                    min_dist_idx = block.id

            if min_dist_idx is not None and min_dist < self.maximum_assignment_distance:
                block_lines[min_dist_idx].append((line_idx, provider_output))
                assigned_line_idxs.add(line_idx)

        # This creates new blocks to hold anything too far away
        new_blocks = self.identify_missing_blocks(
            provider_line_idxs, provider_outputs, assigned_line_idxs
        )
        self.create_missing_blocks(new_blocks, block_lines)

        # Add blocks to the page
        self.add_initial_blocks(block_lines, text_extraction_method, keep_chars)

    def aggregate_block_metadata(self) -> BlockMetadata:
        if self.metadata is None:
            self.metadata = BlockMetadata()

        for block in self.current_children:
            if block.metadata is not None:
                self.metadata = self.metadata.merge(block.metadata)
        return self.metadata
```
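Both assignment passes in `merge_blocks` lean on a pairwise intersection-area matrix. A self-contained numpy sketch of that computation (hand-picked bboxes for illustration; this is not the real `matrix_intersection_area` helper, just the same idea):

```python
import numpy as np


def intersection_area(boxes_a, boxes_b):
    # Boxes are [x0, y0, x1, y1]; returns a (len_a, len_b) matrix of overlap areas.
    a = np.asarray(boxes_a, dtype=float)[:, None, :]
    b = np.asarray(boxes_b, dtype=float)[None, :, :]
    w = np.clip(np.minimum(a[..., 2], b[..., 2]) - np.maximum(a[..., 0], b[..., 0]), 0, None)
    h = np.clip(np.minimum(a[..., 3], b[..., 3]) - np.maximum(a[..., 1], b[..., 1]), 0, None)
    return w * h


lines = [[0, 0, 10, 2], [0, 5, 10, 7]]    # hypothetical line bboxes
blocks = [[0, 0, 12, 3], [0, 4, 12, 8]]   # hypothetical block bboxes
m = intersection_area(lines, blocks)
print(m.argmax(axis=1))  # each line goes to the block it overlaps most: [0 1]
```

Lines with a zero row in this matrix fall through to the distance-based pass, and anything still unassigned becomes a new heuristic `Text` block.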
--------------------------------------------------------------------------------
/marker/builders/line.py:
--------------------------------------------------------------------------------

```python
from copy import deepcopy
from typing import Annotated, List, Tuple

import numpy as np
from PIL import Image
import cv2

from surya.detection import DetectionPredictor
from surya.ocr_error import OCRErrorPredictor

from marker.builders import BaseBuilder
from marker.providers import ProviderOutput, ProviderPageLines
from marker.providers.pdf import PdfProvider
from marker.schema import BlockTypes
from marker.schema.document import Document
from marker.schema.groups.page import PageGroup
from marker.schema.polygon import PolygonBox
from marker.schema.registry import get_block_class
from marker.schema.text.line import Line
from marker.settings import settings
from marker.util import matrix_intersection_area, sort_text_lines
from marker.utils.image import is_blank_image


class LineBuilder(BaseBuilder):
    """
    A builder for detecting text lines. Merges the detected lines with the lines from the provider
    """

    detection_batch_size: Annotated[
        int,
        "The batch size to use for the detection model.",
        "Default is None, which will use the default batch size for the model.",
    ] = None
    ocr_error_batch_size: Annotated[
        int,
        "The batch size to use for the ocr error detection model.",
        "Default is None, which will use the default batch size for the model.",
    ] = None
    layout_coverage_min_lines: Annotated[
        int,
        "The minimum number of PdfProvider lines that must be covered by the layout model",
        "to consider the lines from the PdfProvider valid.",
    ] = 1
    layout_coverage_threshold: Annotated[
        float,
        "The minimum coverage ratio required for the layout model to consider",
        "the lines from the PdfProvider valid.",
    ] = 0.25
    min_document_ocr_threshold: Annotated[
        float,
        "If less pages than this threshold are good, OCR will happen in the document. Otherwise it will not.",
    ] = 0.85
    provider_line_provider_line_min_overlap_pct: Annotated[
        float,
        "The percentage of a provider line that has to be covered by a detected line",
    ] = 0.1
    excluded_for_coverage: Annotated[
        Tuple[BlockTypes],
        "A list of block types to exclude from the layout coverage check.",
    ] = (
        BlockTypes.Figure,
        BlockTypes.Picture,
        BlockTypes.Table,
        BlockTypes.FigureGroup,
        BlockTypes.TableGroup,
        BlockTypes.PictureGroup,
    )
    ocr_remove_blocks: Tuple[BlockTypes, ...] = (
        BlockTypes.Table,
        BlockTypes.Form,
        BlockTypes.TableOfContents,
    )
    disable_tqdm: Annotated[
        bool,
        "Disable tqdm progress bars.",
    ] = False
    disable_ocr: Annotated[
        bool,
        "Disable OCR for the document. This will only use the lines from the provider.",
    ] = False
    keep_chars: Annotated[bool, "Keep individual characters."] = False
    detection_line_min_confidence: Annotated[
        float, "Minimum confidence for a detected line to be included"
    ] = 0.8

    def __init__(
        self,
        detection_model: DetectionPredictor,
        ocr_error_model: OCRErrorPredictor,
        config=None,
    ):
        super().__init__(config)

        self.detection_model = detection_model
        self.ocr_error_model = ocr_error_model

    def __call__(self, document: Document, provider: PdfProvider):
        # Disable inline detection for documents where layout model doesn't detect any equations
        # Also disable if we won't use the inline detections (if we aren't using the LLM)
        provider_lines, ocr_lines = self.get_all_lines(document, provider)
        self.merge_blocks(document, provider_lines, ocr_lines)

    def get_detection_batch_size(self):
        if self.detection_batch_size is not None:
            return self.detection_batch_size
        elif settings.TORCH_DEVICE_MODEL == "cuda":
            return 10
        return 4

    def get_ocr_error_batch_size(self):
        if self.ocr_error_batch_size is not None:
            return self.ocr_error_batch_size
        elif settings.TORCH_DEVICE_MODEL == "cuda":
            return 14
        return 4

    def get_detection_results(
        self, page_images: List[Image.Image], run_detection: List[bool]
    ):
        self.detection_model.disable_tqdm = self.disable_tqdm
        page_detection_results = self.detection_model(
            images=page_images, batch_size=self.get_detection_batch_size()
        )

        assert len(page_detection_results) == sum(run_detection)
        detection_results = []
        idx = 0
        for good in run_detection:
            if good:
                detection_results.append(page_detection_results[idx])
                idx += 1
            else:
                detection_results.append(None)
        assert idx == len(page_images)
        assert len(run_detection) == len(detection_results)

        return detection_results

    def get_all_lines(self, document: Document, provider: PdfProvider):
        ocr_error_detection_results = self.ocr_error_detection(
            document.pages, provider.page_lines
        )

        boxes_to_ocr = {page.page_id: [] for page in document.pages}
        page_lines = {page.page_id: [] for page in document.pages}

        LineClass: Line = get_block_class(BlockTypes.Line)

        layout_good = []
        for document_page, ocr_error_detection_label in zip(
            document.pages, ocr_error_detection_results.labels
        ):
            document_page.ocr_errors_detected = ocr_error_detection_label == "bad"
            provider_lines: List[ProviderOutput] = provider.page_lines.get(
                document_page.page_id, []
            )
            provider_lines_good = all(
                [
                    bool(provider_lines),
                    not document_page.ocr_errors_detected,
                    self.check_layout_coverage(document_page, provider_lines),
                    self.check_line_overlaps(
                        document_page, provider_lines
                    ),  # Ensure provider lines don't overflow the page or intersect
                ]
            )
            if self.disable_ocr:
                provider_lines_good = True

            layout_good.append(provider_lines_good)

        run_detection = [not good for good in layout_good]
        page_images = [
            page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks)
            for page, bad in zip(document.pages, run_detection)
            if bad
        ]

        # Note: run_detection is longer than page_images, since it has a value for each page, not just good ones
        # Detection results and inline detection results are for every page (we use run_detection to make the list full length)
        detection_results = self.get_detection_results(page_images, run_detection)

        assert len(detection_results) == len(layout_good) == len(document.pages)
        for document_page, detection_result, provider_lines_good in zip(
            document.pages, detection_results, layout_good
        ):
            provider_lines: List[ProviderOutput] = provider.page_lines.get(
                document_page.page_id, []
            )

            # Setup detection results
            detection_boxes = []
            if detection_result:
                detection_boxes = [
                    PolygonBox(polygon=box.polygon)
                    for box in detection_result.bboxes
                    if box.confidence > self.detection_line_min_confidence
                ]
                detection_boxes = sort_text_lines(detection_boxes)

            if provider_lines_good:
                document_page.text_extraction_method = "pdftext"  # Mark extraction method as pdftext, since all lines are good
                for provider_line in provider_lines:
                    provider_line.line.text_extraction_method = "pdftext"

                page_lines[document_page.page_id] = provider_lines
            else:
                document_page.text_extraction_method = "surya"
                boxes_to_ocr[document_page.page_id].extend(detection_boxes)

        # Dummy lines to merge into the document - Contains no spans, will be filled in later by OCRBuilder
        ocr_lines = {document_page.page_id: [] for document_page in document.pages}
        for page_id, page_ocr_boxes in boxes_to_ocr.items():
            page_size = provider.get_page_bbox(page_id).size
            image_size = document.get_page(page_id).get_image(highres=False).size
            for box_to_ocr in page_ocr_boxes:
                line_polygon = PolygonBox(polygon=box_to_ocr.polygon).rescale(
                    image_size, page_size
                )
                ocr_lines[page_id].append(
                    ProviderOutput(
                        line=LineClass(
                            polygon=line_polygon,
                            page_id=page_id,
                            text_extraction_method="surya",
                        ),
                        spans=[],
                        chars=[],
                    )
                )

        return page_lines, ocr_lines

    def ocr_error_detection(
        self, pages: List[PageGroup], provider_page_lines: ProviderPageLines
    ):
        page_texts = []
        for document_page in pages:
            provider_lines = provider_page_lines.get(document_page.page_id, [])
            page_text = "\n".join(
                " ".join(s.text for s in line.spans) for line in provider_lines
            )
            page_texts.append(page_text)

        self.ocr_error_model.disable_tqdm = self.disable_tqdm
        ocr_error_detection_results = self.ocr_error_model(
            page_texts, batch_size=int(self.get_ocr_error_batch_size())
        )
        return ocr_error_detection_results

    def check_line_overlaps(
        self, document_page: PageGroup, provider_lines: List[ProviderOutput]
    ) -> bool:
        provider_bboxes = [line.line.polygon.bbox for line in provider_lines]

        # Add a small margin to account for minor overflows
        page_bbox = document_page.polygon.expand(5, 5).bbox
        for bbox in provider_bboxes:
            if bbox[0] < page_bbox[0]:
                return False
            if bbox[1] < page_bbox[1]:
                return False
            if bbox[2] > page_bbox[2]:
                return False
            if bbox[3] > page_bbox[3]:
                return False

        intersection_matrix = matrix_intersection_area(provider_bboxes, provider_bboxes)

        for i, line in enumerate(provider_lines):
            intersect_counts = np.sum(
                intersection_matrix[i] > self.provider_line_provider_line_min_overlap_pct
            )
            # There should be one intersection with itself
            if intersect_counts > 2:
                return False

        return True

    def check_layout_coverage(
        self,
        document_page: PageGroup,
        provider_lines: List[ProviderOutput],
    ):
        covered_blocks = 0
        total_blocks = 0
        large_text_blocks = 0

        layout_blocks = [
            document_page.get_block(block) for block in document_page.structure
        ]
        layout_blocks = [
            b for b in layout_blocks if b.block_type not in self.excluded_for_coverage
        ]

        layout_bboxes = [block.polygon.bbox for block in layout_blocks]
        provider_bboxes = [line.line.polygon.bbox for line in provider_lines]

        if len(layout_bboxes) == 0:
            return True

        if len(provider_bboxes) == 0:
            return False

        intersection_matrix = matrix_intersection_area(layout_bboxes, provider_bboxes)

        for idx, layout_block in enumerate(layout_blocks):
            total_blocks += 1
            intersecting_lines = np.count_nonzero(intersection_matrix[idx] > 0)

            if intersecting_lines >= self.layout_coverage_min_lines:
                covered_blocks += 1

            if (
                layout_block.polygon.intersection_pct(document_page.polygon) > 0.8
                and layout_block.block_type == BlockTypes.Text
            ):
                large_text_blocks += 1

        coverage_ratio = covered_blocks / total_blocks if total_blocks > 0 else 1
        text_okay = coverage_ratio >= self.layout_coverage_threshold

        # Model will sometimes say there is a single block of text on the page when it is blank
        if not text_okay and (total_blocks == 1 and large_text_blocks == 1):
            text_okay = True

        return text_okay

    def filter_blank_lines(self, page: PageGroup, lines: List[ProviderOutput]):
        page_size = (page.polygon.width, page.polygon.height)
        page_image = page.get_image()
        image_size = page_image.size

        good_lines = []
        for line in lines:
            line_polygon_rescaled = deepcopy(line.line.polygon).rescale(
                page_size, image_size
            )
            line_bbox = line_polygon_rescaled.fit_to_bounds((0, 0, *image_size)).bbox

            if not is_blank_image(page_image.crop(line_bbox)):
                good_lines.append(line)

        return good_lines

    def merge_blocks(
        self,
        document: Document,
        page_provider_lines: ProviderPageLines,
        page_ocr_lines: ProviderPageLines,
    ):
        for document_page in document.pages:
            provider_lines: List[ProviderOutput] = page_provider_lines[
                document_page.page_id
            ]
            ocr_lines: List[ProviderOutput] = page_ocr_lines[document_page.page_id]

            # Only one or the other will have lines
            # Filter out blank lines which come from bad provider boxes, or invisible text
            merged_lines = self.filter_blank_lines(
                document_page, provider_lines + ocr_lines
            )

            # Text extraction method is overridden later for OCRed documents
            document_page.merge_blocks(
                merged_lines,
                text_extraction_method="pdftext" if provider_lines else "surya",
                keep_chars=self.keep_chars,
            )
```
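The gate in `check_layout_coverage` boils down to a single ratio: the fraction of layout blocks that intersect at least `layout_coverage_min_lines` provider lines. A condensed sketch of that decision, with hypothetical per-block intersection counts:

```python
layout_coverage_min_lines = 1
layout_coverage_threshold = 0.25

# Hypothetical: number of provider lines intersecting each layout block.
intersecting_lines_per_block = [3, 0, 1, 0]

covered = sum(n >= layout_coverage_min_lines for n in intersecting_lines_per_block)
coverage_ratio = covered / len(intersecting_lines_per_block)  # 0.5
text_okay = coverage_ratio >= layout_coverage_threshold
print(text_okay)  # True -> keep the provider (pdftext) lines and skip OCR detection
```

When this check fails (alongside the OCR-error and overlap checks), the page falls back to surya detection, and the resulting dummy lines are filled in later by the OCR builder.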