This is page 2 of 5. Use http://codebase.md/microsoft/markitdown?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .devcontainer │ └── devcontainer.json ├── .dockerignore ├── .gitattributes ├── .github │ ├── dependabot.yml │ └── workflows │ ├── pre-commit.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── packages │ ├── markitdown │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── src │ │ │ └── markitdown │ │ │ ├── __about__.py │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ ├── _base_converter.py │ │ │ ├── _exceptions.py │ │ │ ├── _markitdown.py │ │ │ ├── _stream_info.py │ │ │ ├── _uri_utils.py │ │ │ ├── converter_utils │ │ │ │ ├── __init__.py │ │ │ │ └── docx │ │ │ │ ├── __init__.py │ │ │ │ ├── math │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── latex_dict.py │ │ │ │ │ └── omml.py │ │ │ │ └── pre_process.py │ │ │ ├── converters │ │ │ │ ├── __init__.py │ │ │ │ ├── _audio_converter.py │ │ │ │ ├── _bing_serp_converter.py │ │ │ │ ├── _csv_converter.py │ │ │ │ ├── _doc_intel_converter.py │ │ │ │ ├── _docx_converter.py │ │ │ │ ├── _epub_converter.py │ │ │ │ ├── _exiftool.py │ │ │ │ ├── _html_converter.py │ │ │ │ ├── _image_converter.py │ │ │ │ ├── _ipynb_converter.py │ │ │ │ ├── _llm_caption.py │ │ │ │ ├── _markdownify.py │ │ │ │ ├── _outlook_msg_converter.py │ │ │ │ ├── _pdf_converter.py │ │ │ │ ├── _plain_text_converter.py │ │ │ │ ├── _pptx_converter.py │ │ │ │ ├── _rss_converter.py │ │ │ │ ├── _transcribe_audio.py │ │ │ │ ├── _wikipedia_converter.py │ │ │ │ ├── _xlsx_converter.py │ │ │ │ ├── _youtube_converter.py │ │ │ │ └── _zip_converter.py │ │ │ └── py.typed │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── _test_vectors.py │ │ │ ├── test_cli_misc.py │ │ │ ├── test_cli_vectors.py │ │ │ ├── test_docintel_html.py │ │ │ ├── test_files │ │ │ │ ├── equations.docx │ │ │ │ ├── random.bin │ │ │ │ ├── test_blog.html │ │ │ │ ├── test_files.zip │ │ │ │ ├── test_llm.jpg │ │ │ │ ├── test_mskanji.csv │ │ │ │ ├── test_notebook.ipynb │ │ │ │ ├── test_outlook_msg.msg │ │ │ │ ├── test_rss.xml │ │ │ │ ├── test_serp.html │ │ │ │ ├── test_wikipedia.html │ │ │ │ ├── test_with_comment.docx │ │ │ │ ├── test.docx │ │ │ │ ├── test.epub │ │ │ │ ├── test.jpg │ │ │ │ ├── test.json │ │ │ │ ├── test.m4a │ │ │ │ ├── test.mp3 │ │ │ │ ├── test.pdf │ │ │ │ ├── test.pptx │ │ │ │ ├── test.wav │ │ │ │ ├── test.xls │ │ │ │ └── test.xlsx │ │ │ ├── test_module_misc.py │ │ │ └── test_module_vectors.py │ │ └── ThirdPartyNotices.md │ ├── markitdown-mcp │ │ ├── Dockerfile │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── src │ │ │ └── markitdown_mcp │ │ │ ├── __about__.py │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ └── py.typed │ │ └── tests │ │ └── __init__.py │ └── markitdown-sample-plugin │ ├── pyproject.toml │ ├── README.md │ ├── src │ │ └── markitdown_sample_plugin │ │ ├── __about__.py │ │ ├── __init__.py │ │ ├── _plugin.py │ │ └── py.typed │ └── tests │ ├── __init__.py │ ├── test_files │ │ └── test.rtf │ └── test_sample_plugin.py ├── README.md ├── SECURITY.md └── SUPPORT.md ``` # Files -------------------------------------------------------------------------------- /packages/markitdown/tests/test_cli_vectors.py: -------------------------------------------------------------------------------- ```python 1 | #!/usr/bin/env python3 -m pytest 2 | import os 3 | import time 4 | import pytest 5 | import subprocess 6 | import locale 7 | from typing import List 8 | 9 | if __name__ == "__main__": 10 | from _test_vectors import ( 11 | 
        GENERAL_TEST_VECTORS,
12 |         DATA_URI_TEST_VECTORS,
13 |         FileTestVector,
14 |     )
15 | else:
16 |     from ._test_vectors import (
17 |         GENERAL_TEST_VECTORS,
18 |         DATA_URI_TEST_VECTORS,
19 |         FileTestVector,
20 |     )
21 | 
22 | skip_remote = (
23 |     True if os.environ.get("GITHUB_ACTIONS") else False
24 | )  # Don't run these tests in CI
25 | 
26 | TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
27 | TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files"
28 | 
29 | 
30 | # Prepare CLI test vectors (remove vectors that require mocking the URL)
31 | CLI_TEST_VECTORS: List[FileTestVector] = []
32 | for test_vector in GENERAL_TEST_VECTORS:
33 |     if test_vector.url is not None:
34 |         continue
35 |     CLI_TEST_VECTORS.append(test_vector)
36 | 
37 | 
38 | @pytest.fixture(scope="session")
39 | def shared_tmp_dir(tmp_path_factory):
40 |     return tmp_path_factory.mktemp("pytest_tmp")
41 | 
42 | 
43 | @pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
44 | def test_output_to_stdout(shared_tmp_dir, test_vector) -> None:
45 |     """Test that the CLI outputs to stdout correctly."""
46 | 
47 |     result = subprocess.run(
48 |         [
49 |             "python",
50 |             "-m",
51 |             "markitdown",
52 |             os.path.join(TEST_FILES_DIR, test_vector.filename),
53 |         ],
54 |         capture_output=True,
55 |         text=True,
56 |     )
57 | 
58 |     assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
59 |     for test_string in test_vector.must_include:
60 |         assert test_string in result.stdout
61 |     for test_string in test_vector.must_not_include:
62 |         assert test_string not in result.stdout
63 | 
64 | 
65 | @pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
66 | def test_output_to_file(shared_tmp_dir, test_vector) -> None:
67 |     """Test that the CLI outputs to a file correctly."""
68 | 
69 |     output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
70 |     result = subprocess.run(
71 |         [
72 |             "python",
73 |             "-m",
74 |             "markitdown",
75 |             "-o",
76 |             output_file,
77 |             os.path.join(TEST_FILES_DIR, test_vector.filename),
78 |         ],
79 |         capture_output=True,
80 |         text=True,
81 |     )
82 | 
83 |     assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
84 |     assert os.path.exists(output_file), f"Output file not created: {output_file}"
85 | 
86 |     with open(output_file, "r") as f:
87 |         output_data = f.read()
88 |         for test_string in test_vector.must_include:
89 |             assert test_string in output_data
90 |         for test_string in test_vector.must_not_include:
91 |             assert test_string not in output_data
92 | 
93 |     os.remove(output_file)
94 |     assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
95 | 
96 | 
97 | @pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
98 | def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None:
99 |     """Test that the CLI reads from stdin correctly."""
100 | 
101 |     test_input = b""
102 |     with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
103 |         test_input = stream.read()
104 | 
105 |     result = subprocess.run(
106 |         [
107 |             "python",
108 |             "-m",
109 |             "markitdown",
110 |             # No filename argument: the document is provided via stdin below
111 |         ],
112 |         input=test_input,
113 |         capture_output=True,
114 |         text=False,
115 |     )
116 | 
117 |     stdout = result.stdout.decode(locale.getpreferredencoding())
118 |     assert (
119 |         result.returncode == 0
120 |     ), f"CLI exited with error: {result.stderr.decode('utf-8')}"
121 |     for test_string in test_vector.must_include:
122 |         assert test_string in stdout
123 |     for test_string in test_vector.must_not_include:
124 |         assert test_string not in stdout
125 | 
126 | 
127 | @pytest.mark.skipif(
128 |     skip_remote,
129 |     reason="do not run tests that query external urls",
130 | )
131 | @pytest.mark.parametrize("test_vector", CLI_TEST_VECTORS)
132 | def test_convert_url(shared_tmp_dir, test_vector):
133 |     """Test that the CLI converts a remote URL correctly."""
134 |     # Note: tmp_dir is not used here, but is needed to match the signature
135 | 
136 |     time.sleep(1)  # Ensure we don't hit rate limits
137 |     result = subprocess.run(
138 |         ["python", "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename],
139 |         capture_output=True,
140 |         text=False,
141 |     )
142 | 
143 |     stdout = result.stdout.decode(locale.getpreferredencoding())
144 |     assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
145 |     for test_string in test_vector.must_include:
146 |         assert test_string in stdout
147 |     for test_string in test_vector.must_not_include:
148 |         assert test_string not in stdout
149 | 
150 | 
151 | @pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
152 | def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None:
153 |     """Test CLI functionality when keep_data_uris is enabled"""
154 | 
155 |     output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output")
156 |     result = subprocess.run(
157 |         [
158 |             "python",
159 |             "-m",
160 |             "markitdown",
161 |             "--keep-data-uris",
162 |             "-o",
163 |             output_file,
164 |             os.path.join(TEST_FILES_DIR, test_vector.filename),
165 |         ],
166 |         capture_output=True,
167 |         text=True,
168 |     )
169 | 
170 |     assert result.returncode == 0, f"CLI exited with error: {result.stderr}"
171 |     assert os.path.exists(output_file), f"Output file not created: {output_file}"
172 | 
173 |     with open(output_file, "r") as f:
174 |         output_data = f.read()
175 |         for test_string in test_vector.must_include:
176 |             assert test_string in output_data
177 |         for test_string in test_vector.must_not_include:
178 |             assert test_string not in output_data
179 | 
180 |     os.remove(output_file)
181 |     assert not os.path.exists(output_file), f"Output file not deleted: {output_file}"
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     import tempfile
186 | 
187 |     """Runs this file's tests from the command line."""
188 | 
189 |     with tempfile.TemporaryDirectory() as tmp_dir:
190 |         # General tests
191 |         for test_function in [
192 |             test_output_to_stdout,
193 |             test_output_to_file,
194 |             test_input_from_stdin_without_hints,
195 |             test_convert_url,
196 |         ]:
197 |             for test_vector in CLI_TEST_VECTORS:
198 |                 print(
199 |                     f"Running {test_function.__name__} on {test_vector.filename}...",
200 |                     end="",
201 |                 )
202 |                 test_function(tmp_dir, test_vector)
203 |                 print("OK")
204 | 
205 |         # Data URI tests
206 |         for test_function in [
207 |             test_output_to_file_with_data_uris,
208 |         ]:
209 |             for test_vector in DATA_URI_TEST_VECTORS:
210 |                 print(
211 |                     f"Running {test_function.__name__} on {test_vector.filename}...",
212 |                     end="",
213 |                 )
214 |                 test_function(tmp_dir, test_vector)
215 |                 print("OK")
216 | 
217 |     print("All tests passed!")
218 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/tests/test_module_vectors.py:
--------------------------------------------------------------------------------

```python
1 | #!/usr/bin/env python3 -m pytest
2 | import os
3 | import time
4 | import pytest
5 | import base64
6 | 
7 | from pathlib import Path
8 | 
9 | if __name__ == "__main__":
10 |     from _test_vectors
import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS 11 | else: 12 | from ._test_vectors import GENERAL_TEST_VECTORS, DATA_URI_TEST_VECTORS 13 | 14 | from markitdown import ( 15 | MarkItDown, 16 | StreamInfo, 17 | ) 18 | 19 | skip_remote = ( 20 | True if os.environ.get("GITHUB_ACTIONS") else False 21 | ) # Don't run these tests in CI 22 | 23 | TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") 24 | TEST_FILES_URL = "https://raw.githubusercontent.com/microsoft/markitdown/refs/heads/main/packages/markitdown/tests/test_files" 25 | 26 | 27 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) 28 | def test_guess_stream_info(test_vector): 29 | """Test the ability to guess stream info.""" 30 | markitdown = MarkItDown() 31 | 32 | local_path = os.path.join(TEST_FILES_DIR, test_vector.filename) 33 | expected_extension = os.path.splitext(test_vector.filename)[1] 34 | 35 | with open(local_path, "rb") as stream: 36 | guesses = markitdown._get_stream_info_guesses( 37 | stream, 38 | base_guess=StreamInfo( 39 | filename=os.path.basename(test_vector.filename), 40 | local_path=local_path, 41 | extension=expected_extension, 42 | ), 43 | ) 44 | 45 | # For some limited exceptions, we can't guarantee the exact 46 | # mimetype or extension, so we'll special-case them here. 47 | if test_vector.filename in [ 48 | "test_outlook_msg.msg", 49 | ]: 50 | return 51 | 52 | assert guesses[0].mimetype == test_vector.mimetype 53 | assert guesses[0].extension == expected_extension 54 | assert guesses[0].charset == test_vector.charset 55 | 56 | 57 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) 58 | def test_convert_local(test_vector): 59 | """Test the conversion of a local file.""" 60 | markitdown = MarkItDown() 61 | 62 | result = markitdown.convert( 63 | os.path.join(TEST_FILES_DIR, test_vector.filename), url=test_vector.url 64 | ) 65 | for string in test_vector.must_include: 66 | assert string in result.markdown 67 | for string in test_vector.must_not_include: 68 | assert string not in result.markdown 69 | 70 | 71 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) 72 | def test_convert_stream_with_hints(test_vector): 73 | """Test the conversion of a stream with full stream info.""" 74 | markitdown = MarkItDown() 75 | 76 | stream_info = StreamInfo( 77 | extension=os.path.splitext(test_vector.filename)[1], 78 | mimetype=test_vector.mimetype, 79 | charset=test_vector.charset, 80 | ) 81 | 82 | with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: 83 | result = markitdown.convert( 84 | stream, stream_info=stream_info, url=test_vector.url 85 | ) 86 | for string in test_vector.must_include: 87 | assert string in result.markdown 88 | for string in test_vector.must_not_include: 89 | assert string not in result.markdown 90 | 91 | 92 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS) 93 | def test_convert_stream_without_hints(test_vector): 94 | """Test the conversion of a stream with no stream info.""" 95 | markitdown = MarkItDown() 96 | 97 | with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream: 98 | result = markitdown.convert(stream, url=test_vector.url) 99 | for string in test_vector.must_include: 100 | assert string in result.markdown 101 | for string in test_vector.must_not_include: 102 | assert string not in result.markdown 103 | 104 | 105 | @pytest.mark.skipif( 106 | skip_remote, 107 | reason="do not run tests that query external urls", 108 | ) 109 | @pytest.mark.parametrize("test_vector", 
GENERAL_TEST_VECTORS)
110 | def test_convert_http_uri(test_vector):
111 |     """Test the conversion of an HTTP:// or HTTPS:// URI."""
112 |     markitdown = MarkItDown()
113 | 
114 |     time.sleep(1)  # Ensure we don't hit rate limits
115 | 
116 |     result = markitdown.convert(
117 |         TEST_FILES_URL + "/" + test_vector.filename,
118 |         url=test_vector.url,  # Mock where this file would be found
119 |     )
120 |     for string in test_vector.must_include:
121 |         assert string in result.markdown
122 |     for string in test_vector.must_not_include:
123 |         assert string not in result.markdown
124 | 
125 | 
126 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
127 | def test_convert_file_uri(test_vector):
128 |     """Test the conversion of a file:// URI."""
129 |     markitdown = MarkItDown()
130 | 
131 |     result = markitdown.convert(
132 |         Path(os.path.join(TEST_FILES_DIR, test_vector.filename)).as_uri(),
133 |         url=test_vector.url,
134 |     )
135 |     for string in test_vector.must_include:
136 |         assert string in result.markdown
137 |     for string in test_vector.must_not_include:
138 |         assert string not in result.markdown
139 | 
140 | 
141 | @pytest.mark.parametrize("test_vector", GENERAL_TEST_VECTORS)
142 | def test_convert_data_uri(test_vector):
143 |     """Test the conversion of a data URI."""
144 |     markitdown = MarkItDown()
145 | 
146 |     data = ""
147 |     with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
148 |         data = base64.b64encode(stream.read()).decode("utf-8")
149 |     mimetype = test_vector.mimetype
150 |     data_uri = f"data:{mimetype};base64,{data}"
151 | 
152 |     result = markitdown.convert(
153 |         data_uri,
154 |         url=test_vector.url,
155 |     )
156 |     for string in test_vector.must_include:
157 |         assert string in result.markdown
158 |     for string in test_vector.must_not_include:
159 |         assert string not in result.markdown
160 | 
161 | 
162 | @pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
163 | def test_convert_keep_data_uris(test_vector):
164 |     """Test API functionality when keep_data_uris is enabled"""
165 |     markitdown = MarkItDown()
166 | 
167 |     # Test local file conversion
168 |     result = markitdown.convert(
169 |         os.path.join(TEST_FILES_DIR, test_vector.filename),
170 |         keep_data_uris=True,
171 |         url=test_vector.url,
172 |     )
173 | 
174 |     for string in test_vector.must_include:
175 |         assert string in result.markdown
176 |     for string in test_vector.must_not_include:
177 |         assert string not in result.markdown
178 | 
179 | 
180 | @pytest.mark.parametrize("test_vector", DATA_URI_TEST_VECTORS)
181 | def test_convert_stream_keep_data_uris(test_vector):
182 |     """Test the conversion of a stream with keep_data_uris enabled."""
183 |     markitdown = MarkItDown()
184 | 
185 |     stream_info = StreamInfo(
186 |         extension=os.path.splitext(test_vector.filename)[1],
187 |         mimetype=test_vector.mimetype,
188 |         charset=test_vector.charset,
189 |     )
190 | 
191 |     with open(os.path.join(TEST_FILES_DIR, test_vector.filename), "rb") as stream:
192 |         result = markitdown.convert(
193 |             stream, stream_info=stream_info, keep_data_uris=True, url=test_vector.url
194 |         )
195 | 
196 |     for string in test_vector.must_include:
197 |         assert string in result.markdown
198 |     for string in test_vector.must_not_include:
199 |         assert string not in result.markdown
200 | 
201 | 
202 | if __name__ == "__main__":
203 |     """Runs this file's tests from the command line."""
204 | 
205 |     # General tests
206 |     for test_function in [
207 |         test_guess_stream_info,
208 |         test_convert_local,
209 |         test_convert_stream_with_hints,
210 |         test_convert_stream_without_hints,
211
| test_convert_http_uri, 212 | test_convert_file_uri, 213 | test_convert_data_uri, 214 | ]: 215 | for test_vector in GENERAL_TEST_VECTORS: 216 | print( 217 | f"Running {test_function.__name__} on {test_vector.filename}...", end="" 218 | ) 219 | test_function(test_vector) 220 | print("OK") 221 | 222 | # Data URI tests 223 | for test_function in [ 224 | test_convert_keep_data_uris, 225 | test_convert_stream_keep_data_uris, 226 | ]: 227 | for test_vector in DATA_URI_TEST_VECTORS: 228 | print( 229 | f"Running {test_function.__name__} on {test_vector.filename}...", end="" 230 | ) 231 | test_function(test_vector) 232 | print("OK") 233 | 234 | print("All tests passed!") 235 | ``` -------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/converters/_youtube_converter.py: -------------------------------------------------------------------------------- ```python 1 | import json 2 | import time 3 | import re 4 | import bs4 5 | from typing import Any, BinaryIO, Dict, List, Union 6 | from urllib.parse import parse_qs, urlparse, unquote 7 | 8 | from .._base_converter import DocumentConverter, DocumentConverterResult 9 | from .._stream_info import StreamInfo 10 | 11 | # Optional YouTube transcription support 12 | try: 13 | # Suppress some warnings on library import 14 | import warnings 15 | 16 | with warnings.catch_warnings(): 17 | warnings.filterwarnings("ignore", category=SyntaxWarning) 18 | # Patch submitted upstream to fix the SyntaxWarning 19 | from youtube_transcript_api import YouTubeTranscriptApi 20 | 21 | IS_YOUTUBE_TRANSCRIPT_CAPABLE = True 22 | except ModuleNotFoundError: 23 | IS_YOUTUBE_TRANSCRIPT_CAPABLE = False 24 | 25 | 26 | ACCEPTED_MIME_TYPE_PREFIXES = [ 27 | "text/html", 28 | "application/xhtml", 29 | ] 30 | 31 | ACCEPTED_FILE_EXTENSIONS = [ 32 | ".html", 33 | ".htm", 34 | ] 35 | 36 | 37 | class YouTubeConverter(DocumentConverter): 38 | """Handle YouTube specially, focusing on the video title, description, and transcript.""" 39 | 40 | def accepts( 41 | self, 42 | file_stream: BinaryIO, 43 | stream_info: StreamInfo, 44 | **kwargs: Any, # Options to pass to the converter 45 | ) -> bool: 46 | """ 47 | Make sure we're dealing with HTML content *from* YouTube. 
48 | """ 49 | url = stream_info.url or "" 50 | mimetype = (stream_info.mimetype or "").lower() 51 | extension = (stream_info.extension or "").lower() 52 | 53 | url = unquote(url) 54 | url = url.replace(r"\?", "?").replace(r"\=", "=") 55 | 56 | if not url.startswith("https://www.youtube.com/watch?"): 57 | # Not a YouTube URL 58 | return False 59 | 60 | if extension in ACCEPTED_FILE_EXTENSIONS: 61 | return True 62 | 63 | for prefix in ACCEPTED_MIME_TYPE_PREFIXES: 64 | if mimetype.startswith(prefix): 65 | return True 66 | 67 | # Not HTML content 68 | return False 69 | 70 | def convert( 71 | self, 72 | file_stream: BinaryIO, 73 | stream_info: StreamInfo, 74 | **kwargs: Any, # Options to pass to the converter 75 | ) -> DocumentConverterResult: 76 | # Parse the stream 77 | encoding = "utf-8" if stream_info.charset is None else stream_info.charset 78 | soup = bs4.BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) 79 | 80 | # Read the meta tags 81 | metadata: Dict[str, str] = {} 82 | 83 | if soup.title and soup.title.string: 84 | metadata["title"] = soup.title.string 85 | 86 | for meta in soup(["meta"]): 87 | if not isinstance(meta, bs4.Tag): 88 | continue 89 | 90 | for a in meta.attrs: 91 | if a in ["itemprop", "property", "name"]: 92 | key = str(meta.get(a, "")) 93 | content = str(meta.get("content", "")) 94 | if key and content: # Only add non-empty content 95 | metadata[key] = content 96 | break 97 | 98 | # Try reading the description 99 | try: 100 | for script in soup(["script"]): 101 | if not isinstance(script, bs4.Tag): 102 | continue 103 | if not script.string: # Skip empty scripts 104 | continue 105 | content = script.string 106 | if "ytInitialData" in content: 107 | match = re.search(r"var ytInitialData = ({.*?});", content) 108 | if match: 109 | data = json.loads(match.group(1)) 110 | attrdesc = self._findKey(data, "attributedDescriptionBodyText") 111 | if attrdesc and isinstance(attrdesc, dict): 112 | metadata["description"] = str(attrdesc.get("content", "")) 113 | break 114 | except Exception as e: 115 | print(f"Error extracting description: {e}") 116 | pass 117 | 118 | # Start preparing the page 119 | webpage_text = "# YouTube\n" 120 | 121 | title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore 122 | assert isinstance(title, str) 123 | 124 | if title: 125 | webpage_text += f"\n## {title}\n" 126 | 127 | stats = "" 128 | views = self._get(metadata, ["interactionCount"]) # type: ignore 129 | if views: 130 | stats += f"- **Views:** {views}\n" 131 | 132 | keywords = self._get(metadata, ["keywords"]) # type: ignore 133 | if keywords: 134 | stats += f"- **Keywords:** {keywords}\n" 135 | 136 | runtime = self._get(metadata, ["duration"]) # type: ignore 137 | if runtime: 138 | stats += f"- **Runtime:** {runtime}\n" 139 | 140 | if len(stats) > 0: 141 | webpage_text += f"\n### Video Metadata\n{stats}\n" 142 | 143 | description = self._get(metadata, ["description", "og:description"]) # type: ignore 144 | if description: 145 | webpage_text += f"\n### Description\n{description}\n" 146 | 147 | if IS_YOUTUBE_TRANSCRIPT_CAPABLE: 148 | ytt_api = YouTubeTranscriptApi() 149 | transcript_text = "" 150 | parsed_url = urlparse(stream_info.url) # type: ignore 151 | params = parse_qs(parsed_url.query) # type: ignore 152 | if "v" in params and params["v"][0]: 153 | video_id = str(params["v"][0]) 154 | transcript_list = ytt_api.list(video_id) 155 | languages = ["en"] 156 | for transcript in transcript_list: 157 | languages.append(transcript.language_code) 158 | break 159 | 
                try:
160 |                     youtube_transcript_languages = kwargs.get(
161 |                         "youtube_transcript_languages", languages
162 |                     )
163 |                     # Retry the transcript fetching operation
164 |                     transcript = self._retry_operation(
165 |                         lambda: ytt_api.fetch(
166 |                             video_id, languages=youtube_transcript_languages
167 |                         ),
168 |                         retries=3,  # Retry 3 times
169 |                         delay=2,  # 2 seconds delay between retries
170 |                     )
171 | 
172 |                     if transcript:
173 |                         transcript_text = " ".join(
174 |                             [part.text for part in transcript]
175 |                         )  # type: ignore
176 |                 except Exception as e:
177 |                     # No transcript available
178 |                     if len(languages) == 1:
179 |                         print(f"Error fetching transcript: {e}")
180 |                     else:
181 |                         # Translate the transcript into the first requested language
182 |                         transcript = (
183 |                             transcript_list.find_transcript(languages)
184 |                             .translate(youtube_transcript_languages[0])
185 |                             .fetch()
186 |                         )
187 |                         transcript_text = " ".join([part.text for part in transcript])
188 |             if transcript_text:
189 |                 webpage_text += f"\n### Transcript\n{transcript_text}\n"
190 | 
191 |         title = title if title else (soup.title.string if soup.title else "")
192 |         assert isinstance(title, str)
193 | 
194 |         return DocumentConverterResult(
195 |             markdown=webpage_text,
196 |             title=title,
197 |         )
198 | 
199 |     def _get(
200 |         self,
201 |         metadata: Dict[str, str],
202 |         keys: List[str],
203 |         default: Union[str, None] = None,
204 |     ) -> Union[str, None]:
205 |         """Get first non-empty value from metadata matching given keys."""
206 |         for k in keys:
207 |             if k in metadata:
208 |                 return metadata[k]
209 |         return default
210 | 
211 |     def _findKey(self, json: Any, key: str) -> Union[str, None]:  # TODO: Fix json type
212 |         """Recursively search for a key in nested dictionary/list structures."""
213 |         if isinstance(json, list):
214 |             for elm in json:
215 |                 ret = self._findKey(elm, key)
216 |                 if ret is not None:
217 |                     return ret
218 |         elif isinstance(json, dict):
219 |             for k, v in json.items():
220 |                 if k == key:
221 |                     return json[k]
222 |                 if result := self._findKey(v, key):
223 |                     return result
224 |         return None
225 | 
226 |     def _retry_operation(self, operation, retries=3, delay=2):
227 |         """Retries the operation if it fails."""
228 |         attempt = 0
229 |         while attempt < retries:
230 |             try:
231 |                 return operation()  # Attempt the operation
232 |             except Exception as e:
233 |                 print(f"Attempt {attempt + 1} failed: {e}")
234 |                 if attempt < retries - 1:
235 |                     time.sleep(delay)  # Wait before retrying
236 |                 attempt += 1
237 |         # If all attempts fail, raise a final exception
238 |         raise Exception(f"Operation failed after {retries} attempts.")
239 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py:
--------------------------------------------------------------------------------

```python
1 | import sys
2 | import re
3 | import os
4 | from typing import BinaryIO, Any, List
5 | from enum import Enum
6 | 
7 | from .._base_converter import DocumentConverter, DocumentConverterResult
8 | from .._stream_info import StreamInfo
9 | from .._exceptions import MissingDependencyException
10 | 
11 | # Try loading optional (but in this case, required) dependencies
12 | # Save reporting of any exceptions for later
13 | _dependency_exc_info = None
14 | try:
15 |     from azure.ai.documentintelligence import DocumentIntelligenceClient
16 |     from azure.ai.documentintelligence.models import (
17 |         AnalyzeDocumentRequest,
18 |         AnalyzeResult,
19 |         DocumentAnalysisFeature,
20 |     )
21 |     from azure.core.credentials import
AzureKeyCredential, TokenCredential 22 | from azure.identity import DefaultAzureCredential 23 | except ImportError: 24 | # Preserve the error and stack trace for later 25 | _dependency_exc_info = sys.exc_info() 26 | 27 | # Define these types for type hinting when the package is not available 28 | class AzureKeyCredential: 29 | pass 30 | 31 | class TokenCredential: 32 | pass 33 | 34 | class DocumentIntelligenceClient: 35 | pass 36 | 37 | class AnalyzeDocumentRequest: 38 | pass 39 | 40 | class AnalyzeResult: 41 | pass 42 | 43 | class DocumentAnalysisFeature: 44 | pass 45 | 46 | class DefaultAzureCredential: 47 | pass 48 | 49 | 50 | # TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum. 51 | # This constant is a temporary fix until the bug is resolved. 52 | CONTENT_FORMAT = "markdown" 53 | 54 | 55 | class DocumentIntelligenceFileType(str, Enum): 56 | """Enum of file types supported by the Document Intelligence Converter.""" 57 | 58 | # No OCR 59 | DOCX = "docx" 60 | PPTX = "pptx" 61 | XLSX = "xlsx" 62 | HTML = "html" 63 | # OCR 64 | PDF = "pdf" 65 | JPEG = "jpeg" 66 | PNG = "png" 67 | BMP = "bmp" 68 | TIFF = "tiff" 69 | 70 | 71 | def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]: 72 | """Get the MIME type prefixes for the given file types.""" 73 | prefixes: List[str] = [] 74 | for type_ in types: 75 | if type_ == DocumentIntelligenceFileType.DOCX: 76 | prefixes.append( 77 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document" 78 | ) 79 | elif type_ == DocumentIntelligenceFileType.PPTX: 80 | prefixes.append( 81 | "application/vnd.openxmlformats-officedocument.presentationml" 82 | ) 83 | elif type_ == DocumentIntelligenceFileType.XLSX: 84 | prefixes.append( 85 | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" 86 | ) 87 | elif type_ == DocumentIntelligenceFileType.HTML: 88 | prefixes.append("text/html") 89 | prefixes.append("application/xhtml+xml") 90 | elif type_ == DocumentIntelligenceFileType.PDF: 91 | prefixes.append("application/pdf") 92 | prefixes.append("application/x-pdf") 93 | elif type_ == DocumentIntelligenceFileType.JPEG: 94 | prefixes.append("image/jpeg") 95 | elif type_ == DocumentIntelligenceFileType.PNG: 96 | prefixes.append("image/png") 97 | elif type_ == DocumentIntelligenceFileType.BMP: 98 | prefixes.append("image/bmp") 99 | elif type_ == DocumentIntelligenceFileType.TIFF: 100 | prefixes.append("image/tiff") 101 | return prefixes 102 | 103 | 104 | def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]: 105 | """Get the file extensions for the given file types.""" 106 | extensions: List[str] = [] 107 | for type_ in types: 108 | if type_ == DocumentIntelligenceFileType.DOCX: 109 | extensions.append(".docx") 110 | elif type_ == DocumentIntelligenceFileType.PPTX: 111 | extensions.append(".pptx") 112 | elif type_ == DocumentIntelligenceFileType.XLSX: 113 | extensions.append(".xlsx") 114 | elif type_ == DocumentIntelligenceFileType.PDF: 115 | extensions.append(".pdf") 116 | elif type_ == DocumentIntelligenceFileType.JPEG: 117 | extensions.append(".jpg") 118 | extensions.append(".jpeg") 119 | elif type_ == DocumentIntelligenceFileType.PNG: 120 | extensions.append(".png") 121 | elif type_ == DocumentIntelligenceFileType.BMP: 122 | extensions.append(".bmp") 123 | elif type_ == DocumentIntelligenceFileType.TIFF: 124 | extensions.append(".tiff") 125 | elif type_ == DocumentIntelligenceFileType.HTML: 126 | extensions.append(".html") 127 
| return extensions 128 | 129 | 130 | class DocumentIntelligenceConverter(DocumentConverter): 131 | """Specialized DocumentConverter that uses Document Intelligence to extract text from documents.""" 132 | 133 | def __init__( 134 | self, 135 | *, 136 | endpoint: str, 137 | api_version: str = "2024-07-31-preview", 138 | credential: AzureKeyCredential | TokenCredential | None = None, 139 | file_types: List[DocumentIntelligenceFileType] = [ 140 | DocumentIntelligenceFileType.DOCX, 141 | DocumentIntelligenceFileType.PPTX, 142 | DocumentIntelligenceFileType.XLSX, 143 | DocumentIntelligenceFileType.PDF, 144 | DocumentIntelligenceFileType.JPEG, 145 | DocumentIntelligenceFileType.PNG, 146 | DocumentIntelligenceFileType.BMP, 147 | DocumentIntelligenceFileType.TIFF, 148 | ], 149 | ): 150 | """ 151 | Initialize the DocumentIntelligenceConverter. 152 | 153 | Args: 154 | endpoint (str): The endpoint for the Document Intelligence service. 155 | api_version (str): The API version to use. Defaults to "2024-07-31-preview". 156 | credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication. 157 | file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types. 158 | """ 159 | 160 | super().__init__() 161 | self._file_types = file_types 162 | 163 | # Raise an error if the dependencies are not available. 164 | # This is different than other converters since this one isn't even instantiated 165 | # unless explicitly requested. 166 | if _dependency_exc_info is not None: 167 | raise MissingDependencyException( 168 | "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`" 169 | ) from _dependency_exc_info[ 170 | 1 171 | ].with_traceback( # type: ignore[union-attr] 172 | _dependency_exc_info[2] 173 | ) 174 | 175 | if credential is None: 176 | if os.environ.get("AZURE_API_KEY") is None: 177 | credential = DefaultAzureCredential() 178 | else: 179 | credential = AzureKeyCredential(os.environ["AZURE_API_KEY"]) 180 | 181 | self.endpoint = endpoint 182 | self.api_version = api_version 183 | self.doc_intel_client = DocumentIntelligenceClient( 184 | endpoint=self.endpoint, 185 | api_version=self.api_version, 186 | credential=credential, 187 | ) 188 | 189 | def accepts( 190 | self, 191 | file_stream: BinaryIO, 192 | stream_info: StreamInfo, 193 | **kwargs: Any, # Options to pass to the converter 194 | ) -> bool: 195 | mimetype = (stream_info.mimetype or "").lower() 196 | extension = (stream_info.extension or "").lower() 197 | 198 | if extension in _get_file_extensions(self._file_types): 199 | return True 200 | 201 | for prefix in _get_mime_type_prefixes(self._file_types): 202 | if mimetype.startswith(prefix): 203 | return True 204 | 205 | return False 206 | 207 | def _analysis_features(self, stream_info: StreamInfo) -> List[str]: 208 | """ 209 | Helper needed to determine which analysis features to use. 
210 |         Certain document analysis features are not available for
211 |         office filetypes (.xlsx, .pptx, .html, .docx)
212 |         """
213 |         mimetype = (stream_info.mimetype or "").lower()
214 |         extension = (stream_info.extension or "").lower()
215 | 
216 |         # Types that don't support ocr
217 |         no_ocr_types = [
218 |             DocumentIntelligenceFileType.DOCX,
219 |             DocumentIntelligenceFileType.PPTX,
220 |             DocumentIntelligenceFileType.XLSX,
221 |             DocumentIntelligenceFileType.HTML,
222 |         ]
223 | 
224 |         if extension in _get_file_extensions(no_ocr_types):
225 |             return []
226 | 
227 |         for prefix in _get_mime_type_prefixes(no_ocr_types):
228 |             if mimetype.startswith(prefix):
229 |                 return []
230 | 
231 |         return [
232 |             DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
233 |             DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
234 |             DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
235 |         ]
236 | 
237 |     def convert(
238 |         self,
239 |         file_stream: BinaryIO,
240 |         stream_info: StreamInfo,
241 |         **kwargs: Any,  # Options to pass to the converter
242 |     ) -> DocumentConverterResult:
243 |         # Extract the text using Azure Document Intelligence
244 |         poller = self.doc_intel_client.begin_analyze_document(
245 |             model_id="prebuilt-layout",
246 |             body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
247 |             features=self._analysis_features(stream_info),
248 |             output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
249 |         )
250 |         result: AnalyzeResult = poller.result()
251 | 
252 |         # remove comments from the markdown content generated by Doc Intelligence and append to markdown string
253 |         markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
254 |         return DocumentConverterResult(markdown=markdown_text)
255 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/tests/_test_vectors.py:
--------------------------------------------------------------------------------

```python
1 | import dataclasses
2 | from typing import List
3 | 
4 | 
5 | @dataclasses.dataclass(frozen=True, kw_only=True)
6 | class FileTestVector(object):
7 |     filename: str
8 |     mimetype: str | None
9 |     charset: str | None
10 |     url: str | None
11 |     must_include: List[str]
12 |     must_not_include: List[str]
13 | 
14 | 
15 | GENERAL_TEST_VECTORS = [
16 |     FileTestVector(
17 |         filename="test.docx",
18 |         mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
19 |         charset=None,
20 |         url=None,
21 |         must_include=[
22 |             "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
23 |             "49e168b7-d2ae-407f-a055-2167576f39a1",
24 |             "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
25 |             "# Abstract",
26 |             "# Introduction",
27 |             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
28 |             "data:image/png;base64...",
29 |         ],
30 |         must_not_include=[
31 |             "data:image/png;base64,iVBORw0KGgoAAAANSU",
32 |         ],
33 |     ),
34 |     FileTestVector(
35 |         filename="test.xlsx",
36 |         mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
37 |         charset=None,
38 |         url=None,
39 |         must_include=[
40 |             "## 09060124-b5e7-4717-9d07-3c046eb",
41 |             "6ff4173b-42a5-4784-9b19-f49caff4d93d",
42 |             "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
43 |         ],
44 |         must_not_include=[],
45 |     ),
46 |     FileTestVector(
47 |         filename="test.xls",
48 |         mimetype="application/vnd.ms-excel",
49 |         charset=None,
50 |         url=None,
51 |         must_include=[
52 |             "## 09060124-b5e7-4717-9d07-3c046eb",
53 |             "6ff4173b-42a5-4784-9b19-f49caff4d93d",
54 |
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", 55 | ], 56 | must_not_include=[], 57 | ), 58 | FileTestVector( 59 | filename="test.pptx", 60 | mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation", 61 | charset=None, 62 | url=None, 63 | must_include=[ 64 | "2cdda5c8-e50e-4db4-b5f0-9722a649f455", 65 | "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", 66 | "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", 67 | "1b92870d-e3b5-4e65-8153-919f4ff45592", 68 | "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 69 | "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title 70 | "2003", # chart value 71 | "", 72 | ], 73 | must_not_include=["data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE"], 74 | ), 75 | FileTestVector( 76 | filename="test_outlook_msg.msg", 77 | mimetype="application/vnd.ms-outlook", 78 | charset=None, 79 | url=None, 80 | must_include=[ 81 | "# Email Message", 82 | "**From:** [email protected]", 83 | "**To:** [email protected]", 84 | "**Subject:** Test Email Message", 85 | "## Content", 86 | "This is the body of the test email message", 87 | ], 88 | must_not_include=[], 89 | ), 90 | FileTestVector( 91 | filename="test.pdf", 92 | mimetype="application/pdf", 93 | charset=None, 94 | url=None, 95 | must_include=[ 96 | "While there is contemporaneous exploration of multi-agent approaches" 97 | ], 98 | must_not_include=[], 99 | ), 100 | FileTestVector( 101 | filename="test_blog.html", 102 | mimetype="text/html", 103 | charset="utf-8", 104 | url="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math", 105 | must_include=[ 106 | "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? 
What about inference parameters?", 107 | "an example where high cost can easily prevent a generic complex", 108 | ], 109 | must_not_include=[], 110 | ), 111 | FileTestVector( 112 | filename="test_wikipedia.html", 113 | mimetype="text/html", 114 | charset="utf-8", 115 | url="https://en.wikipedia.org/wiki/Microsoft", 116 | must_include=[ 117 | "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]", 118 | 'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")', 119 | ], 120 | must_not_include=[ 121 | "You are encouraged to create an account and log in", 122 | "154 languages", 123 | "move to sidebar", 124 | ], 125 | ), 126 | FileTestVector( 127 | filename="test_serp.html", 128 | mimetype="text/html", 129 | charset="utf-8", 130 | url="https://www.bing.com/search?q=microsoft+wikipedia", 131 | must_include=[ 132 | "](https://en.wikipedia.org/wiki/Microsoft", 133 | "Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond", 134 | "1995–2007: Foray into the Web, Windows 95, Windows XP, and Xbox", 135 | ], 136 | must_not_include=[ 137 | "https://www.bing.com/ck/a?!&&p=", 138 | "data:image/svg+xml,%3Csvg%20width%3D", 139 | ], 140 | ), 141 | FileTestVector( 142 | filename="test_mskanji.csv", 143 | mimetype="text/csv", 144 | charset="cp932", 145 | url=None, 146 | must_include=[ 147 | "| 名前 | 年齢 | 住所 |", 148 | "| --- | --- | --- |", 149 | "| 佐藤太郎 | 30 | 東京 |", 150 | "| 三木英子 | 25 | 大阪 |", 151 | "| 髙橋淳 | 35 | 名古屋 |", 152 | ], 153 | must_not_include=[], 154 | ), 155 | FileTestVector( 156 | filename="test.json", 157 | mimetype="application/json", 158 | charset="ascii", 159 | url=None, 160 | must_include=[ 161 | "5b64c88c-b3c3-4510-bcb8-da0b200602d8", 162 | "9700dc99-6685-40b4-9a3a-5e406dcb37f3", 163 | ], 164 | must_not_include=[], 165 | ), 166 | FileTestVector( 167 | filename="test_rss.xml", 168 | mimetype="text/xml", 169 | charset="utf-8", 170 | url=None, 171 | must_include=[ 172 | "# The Official Microsoft Blog", 173 | "## Ignite 2024: Why nearly 70% of the Fortune 500 now use Microsoft 365 Copilot", 174 | "In the case of AI, it is absolutely true that the industry is moving incredibly fast", 175 | ], 176 | must_not_include=["<rss", "<feed"], 177 | ), 178 | FileTestVector( 179 | filename="test_notebook.ipynb", 180 | mimetype="application/json", 181 | charset="ascii", 182 | url=None, 183 | must_include=[ 184 | "# Test Notebook", 185 | "```python", 186 | 'print("markitdown")', 187 | "```", 188 | "## Code Cell Below", 189 | ], 190 | must_not_include=[ 191 | "nbformat", 192 | "nbformat_minor", 193 | ], 194 | ), 195 | FileTestVector( 196 | filename="test_files.zip", 197 | mimetype="application/zip", 198 | charset=None, 199 | url=None, 200 | must_include=[ 201 | "314b0a30-5b04-470b-b9f7-eed2c2bec74a", 202 | "49e168b7-d2ae-407f-a055-2167576f39a1", 203 | "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", 204 | "# Abstract", 205 | "# Introduction", 206 | "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 207 | "2cdda5c8-e50e-4db4-b5f0-9722a649f455", 208 | "04191ea8-5c73-4215-a1d3-1cfb43aaaf12", 209 | "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a", 210 | "1b92870d-e3b5-4e65-8153-919f4ff45592", 211 | "## 09060124-b5e7-4717-9d07-3c046eb", 212 | "6ff4173b-42a5-4784-9b19-f49caff4d93d", 213 | "affc7dad-52dc-4b98-9b5d-51e65d8a8ad0", 214 | "Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]", 215 | 'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill 
Gates")',
216 |         ],
217 |         must_not_include=[],
218 |     ),
219 |     FileTestVector(
220 |         filename="test.epub",
221 |         mimetype="application/epub+zip",
222 |         charset=None,
223 |         url=None,
224 |         must_include=[
225 |             "**Authors:** Test Author",
226 |             "A test EPUB document for MarkItDown testing",
227 |             "# Chapter 1: Test Content",
228 |             "This is a **test** paragraph with some formatting",
229 |             "* A bullet point",
230 |             "* Another point",
231 |             "# Chapter 2: More Content",
232 |             "*different* style",
233 |             "> This is a blockquote for testing",
234 |         ],
235 |         must_not_include=[],
236 |     ),
237 | ]
238 | 
239 | 
240 | DATA_URI_TEST_VECTORS = [
241 |     FileTestVector(
242 |         filename="test.docx",
243 |         mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
244 |         charset=None,
245 |         url=None,
246 |         must_include=[
247 |             "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
248 |             "49e168b7-d2ae-407f-a055-2167576f39a1",
249 |             "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
250 |             "# Abstract",
251 |             "# Introduction",
252 |             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
253 |             "data:image/png;base64,iVBORw0KGgoAAAANSU",
254 |         ],
255 |         must_not_include=[
256 |             "data:image/png;base64...",
257 |         ],
258 |     ),
259 |     FileTestVector(
260 |         filename="test.pptx",
261 |         mimetype="application/vnd.openxmlformats-officedocument.presentationml.presentation",
262 |         charset=None,
263 |         url=None,
264 |         must_include=[
265 |             "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
266 |             "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
267 |             "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
268 |             "1b92870d-e3b5-4e65-8153-919f4ff45592",
269 |             "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
270 |             "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
271 |             "2003",  # chart value
272 |             "![This phrase of the caption is Human-written.]",  # image caption
273 |             "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQE",
274 |         ],
275 |         must_not_include=[
276 |             "",
277 |         ],
278 |     ),
279 | ]
280 | 
```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/converters/_pptx_converter.py:
--------------------------------------------------------------------------------

```python
1 | import sys
2 | import base64
3 | import os
4 | import io
5 | import re
6 | import html
7 | 
8 | from typing import BinaryIO, Any
9 | from operator import attrgetter
10 | 
11 | from ._html_converter import HtmlConverter
12 | from ._llm_caption import llm_caption
13 | from .._base_converter import DocumentConverter, DocumentConverterResult
14 | from .._stream_info import StreamInfo
15 | from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
16 | 
17 | # Try loading optional (but in this case, required) dependencies
18 | # Save reporting of any exceptions for later
19 | _dependency_exc_info = None
20 | try:
21 |     import pptx
22 | except ImportError:
23 |     # Preserve the error and stack trace for later
24 |     _dependency_exc_info = sys.exc_info()
25 | 
26 | 
27 | ACCEPTED_MIME_TYPE_PREFIXES = [
28 |     "application/vnd.openxmlformats-officedocument.presentationml",
29 | ]
30 | 
31 | ACCEPTED_FILE_EXTENSIONS = [".pptx"]
32 | 
33 | 
34 | class PptxConverter(DocumentConverter):
35 |     """
36 |     Converts PPTX files to Markdown. Supports headings, tables, and images with alt text.
37 |     """
38 | 
39 |     def __init__(self):
40 |         super().__init__()
41 |         self._html_converter = HtmlConverter()
42 | 
43 |     def accepts(
44 |         self,
45 |         file_stream: BinaryIO,
46 |         stream_info: StreamInfo,
47 |         **kwargs: Any,  # Options to pass to the converter
48 |     ) -> bool:
49 |         mimetype = (stream_info.mimetype or "").lower()
50 |         extension = (stream_info.extension or "").lower()
51 | 
52 |         if extension in ACCEPTED_FILE_EXTENSIONS:
53 |             return True
54 | 
55 |         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
56 |             if mimetype.startswith(prefix):
57 |                 return True
58 | 
59 |         return False
60 | 
61 |     def convert(
62 |         self,
63 |         file_stream: BinaryIO,
64 |         stream_info: StreamInfo,
65 |         **kwargs: Any,  # Options to pass to the converter
66 |     ) -> DocumentConverterResult:
67 |         # Check the dependencies
68 |         if _dependency_exc_info is not None:
69 |             raise MissingDependencyException(
70 |                 MISSING_DEPENDENCY_MESSAGE.format(
71 |                     converter=type(self).__name__,
72 |                     extension=".pptx",
73 |                     feature="pptx",
74 |                 )
75 |             ) from _dependency_exc_info[
76 |                 1
77 |             ].with_traceback(  # type: ignore[union-attr]
78 |                 _dependency_exc_info[2]
79 |             )
80 | 
81 |         # Perform the conversion
82 |         presentation = pptx.Presentation(file_stream)
83 |         md_content = ""
84 |         slide_num = 0
85 |         for slide in presentation.slides:
86 |             slide_num += 1
87 | 
88 |             md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
89 | 
90 |             title = slide.shapes.title
91 | 
92 |             def get_shape_content(shape, **kwargs):
93 |                 nonlocal md_content
94 |                 # Pictures
95 |                 if self._is_picture(shape):
96 |                     # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
97 | 
98 |                     llm_description = ""
99 |                     alt_text = ""
100 | 
101 |                     # Potentially generate a description using an LLM
102 |                     llm_client = kwargs.get("llm_client")
103 |                     llm_model = kwargs.get("llm_model")
104 |                     if llm_client is not None and llm_model is not None:
105 |                         # Prepare a file_stream and stream_info for the image data
106 |                         image_filename = shape.image.filename
107 |                         image_extension = None
108 |                         if image_filename:
109 |                             image_extension = os.path.splitext(image_filename)[1]
110 |                         image_stream_info = StreamInfo(
111 |                             mimetype=shape.image.content_type,
112 |                             extension=image_extension,
113 |                             filename=image_filename,
114 |                         )
115 | 
116 |                         image_stream = io.BytesIO(shape.image.blob)
117 | 
118 |                         # Caption the image
119 |                         try:
120 |                             llm_description = llm_caption(
121 |                                 image_stream,
122 |                                 image_stream_info,
123 |                                 client=llm_client,
124 |                                 model=llm_model,
125 |                                 prompt=kwargs.get("llm_prompt"),
126 |                             )
127 |                         except Exception:
128 |                             # Unable to generate a description
129 |                             pass
130 | 
131 |                     # Also grab any description embedded in the deck
132 |                     try:
133 |                         alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
134 |                     except Exception:
135 |                         # Unable to get alt text
136 |                         pass
137 | 
138 |                     # Prepare the alt, escaping any special characters
139 |                     alt_text = "\n".join([llm_description, alt_text]) or shape.name
140 |                     alt_text = re.sub(r"[\r\n\[\]]", " ", alt_text)
141 |                     alt_text = re.sub(r"\s+", " ", alt_text).strip()
142 | 
143 |                     # If keep_data_uris is True, use base64 encoding for images
144 |                     if kwargs.get("keep_data_uris", False):
145 |                         blob = shape.image.blob
146 |                         content_type = shape.image.content_type or "image/png"
147 |                         b64_string = base64.b64encode(blob).decode("utf-8")
148 |                         md_content += f"\n![{alt_text}](data:{content_type};base64,{b64_string})\n"
149 |                     else:
150 |                         # A placeholder name
151 |                         filename = re.sub(r"\W", "", shape.name) + ".jpg"
152 |                         md_content += f"\n![{alt_text}]({filename})\n"
153 | 
154 |                 # Tables
155 |                 if self._is_table(shape):
156 |                     md_content += 
self._convert_table_to_markdown(shape.table, **kwargs) 157 | 158 | # Charts 159 | if shape.has_chart: 160 | md_content += self._convert_chart_to_markdown(shape.chart) 161 | 162 | # Text areas 163 | elif shape.has_text_frame: 164 | if shape == title: 165 | md_content += "# " + shape.text.lstrip() + "\n" 166 | else: 167 | md_content += shape.text + "\n" 168 | 169 | # Group Shapes 170 | if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: 171 | sorted_shapes = sorted( 172 | shape.shapes, 173 | key=lambda x: ( 174 | float("-inf") if not x.top else x.top, 175 | float("-inf") if not x.left else x.left, 176 | ), 177 | ) 178 | for subshape in sorted_shapes: 179 | get_shape_content(subshape, **kwargs) 180 | 181 | sorted_shapes = sorted( 182 | slide.shapes, 183 | key=lambda x: ( 184 | float("-inf") if not x.top else x.top, 185 | float("-inf") if not x.left else x.left, 186 | ), 187 | ) 188 | for shape in sorted_shapes: 189 | get_shape_content(shape, **kwargs) 190 | 191 | md_content = md_content.strip() 192 | 193 | if slide.has_notes_slide: 194 | md_content += "\n\n### Notes:\n" 195 | notes_frame = slide.notes_slide.notes_text_frame 196 | if notes_frame is not None: 197 | md_content += notes_frame.text 198 | md_content = md_content.strip() 199 | 200 | return DocumentConverterResult(markdown=md_content.strip()) 201 | 202 | def _is_picture(self, shape): 203 | if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: 204 | return True 205 | if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER: 206 | if hasattr(shape, "image"): 207 | return True 208 | return False 209 | 210 | def _is_table(self, shape): 211 | if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE: 212 | return True 213 | return False 214 | 215 | def _convert_table_to_markdown(self, table, **kwargs): 216 | # Write the table as HTML, then convert it to Markdown 217 | html_table = "<html><body><table>" 218 | first_row = True 219 | for row in table.rows: 220 | html_table += "<tr>" 221 | for cell in row.cells: 222 | if first_row: 223 | html_table += "<th>" + html.escape(cell.text) + "</th>" 224 | else: 225 | html_table += "<td>" + html.escape(cell.text) + "</td>" 226 | html_table += "</tr>" 227 | first_row = False 228 | html_table += "</table></body></html>" 229 | 230 | return ( 231 | self._html_converter.convert_string(html_table, **kwargs).markdown.strip() 232 | + "\n" 233 | ) 234 | 235 | def _convert_chart_to_markdown(self, chart): 236 | try: 237 | md = "\n\n### Chart" 238 | if chart.has_title: 239 | md += f": {chart.chart_title.text_frame.text}" 240 | md += "\n\n" 241 | data = [] 242 | category_names = [c.label for c in chart.plots[0].categories] 243 | series_names = [s.name for s in chart.series] 244 | data.append(["Category"] + series_names) 245 | 246 | for idx, category in enumerate(category_names): 247 | row = [category] 248 | for series in chart.series: 249 | row.append(series.values[idx]) 250 | data.append(row) 251 | 252 | markdown_table = [] 253 | for row in data: 254 | markdown_table.append("| " + " | ".join(map(str, row)) + " |") 255 | header = markdown_table[0] 256 | separator = "|" + "|".join(["---"] * len(data[0])) + "|" 257 | return md + "\n".join([header, separator] + markdown_table[1:]) 258 | except ValueError as e: 259 | # Handle the specific error for unsupported chart types 260 | if "unsupported plot type" in str(e): 261 | return "\n\n[unsupported chart]\n\n" 262 | except Exception: 263 | # Catch any other exceptions that might occur 264 | return "\n\n[unsupported chart]\n\n" 265 | ``` 
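
For orientation, here is a minimal usage sketch showing how the converter above is reached through the public `MarkItDown` API (the same entry point the test suites earlier on this page exercise). The `slides.pptx` path is a hypothetical placeholder; `keep_data_uris` is the option consumed by `PptxConverter.convert`, and the optional `llm_client`/`llm_model` pair feeds the `llm_caption` branch of the picture handler.

```python
from markitdown import MarkItDown

md = MarkItDown()

# Basic conversion: pictures become ![alt](placeholder.jpg) links,
# tables and charts become Markdown tables.
result = md.convert("slides.pptx")  # hypothetical local file
print(result.markdown)

# Keep embedded images inline as base64 data URIs instead.
result = md.convert("slides.pptx", keep_data_uris=True)

# Optional sketch: caption pictures with an LLM (names here are illustrative).
# from openai import OpenAI
# md = MarkItDown(llm_client=OpenAI(), llm_model="gpt-4o")
```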
-------------------------------------------------------------------------------- /packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py: -------------------------------------------------------------------------------- ```python 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Office Math Markup Language (OMML) 5 | Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py 6 | On 25/03/2025 7 | """ 8 | 9 | from defusedxml import ElementTree as ET 10 | 11 | from .latex_dict import ( 12 | CHARS, 13 | CHR, 14 | CHR_BO, 15 | CHR_DEFAULT, 16 | POS, 17 | POS_DEFAULT, 18 | SUB, 19 | SUP, 20 | F, 21 | F_DEFAULT, 22 | T, 23 | FUNC, 24 | D, 25 | D_DEFAULT, 26 | RAD, 27 | RAD_DEFAULT, 28 | ARR, 29 | LIM_FUNC, 30 | LIM_TO, 31 | LIM_UPP, 32 | M, 33 | BRK, 34 | BLANK, 35 | BACKSLASH, 36 | ALN, 37 | FUNC_PLACE, 38 | ) 39 | 40 | OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}" 41 | 42 | 43 | def load(stream): 44 | tree = ET.parse(stream) 45 | for omath in tree.findall(OMML_NS + "oMath"): 46 | yield oMath2Latex(omath) 47 | 48 | 49 | def load_string(string): 50 | root = ET.fromstring(string) 51 | for omath in root.findall(OMML_NS + "oMath"): 52 | yield oMath2Latex(omath) 53 | 54 | 55 | def escape_latex(strs): 56 | last = None 57 | new_chr = [] 58 | strs = strs.replace(r"\\", "\\") 59 | for c in strs: 60 | if (c in CHARS) and (last != BACKSLASH): 61 | new_chr.append(BACKSLASH + c) 62 | else: 63 | new_chr.append(c) 64 | last = c 65 | return BLANK.join(new_chr) 66 | 67 | 68 | def get_val(key, default=None, store=CHR): 69 | if key is not None: 70 | return key if not store else store.get(key, key) 71 | else: 72 | return default 73 | 74 | 75 | class Tag2Method(object): 76 | def call_method(self, elm, stag=None): 77 | getmethod = self.tag2meth.get 78 | if stag is None: 79 | stag = elm.tag.replace(OMML_NS, "") 80 | method = getmethod(stag) 81 | if method: 82 | return method(self, elm) 83 | else: 84 | return None 85 | 86 | def process_children_list(self, elm, include=None): 87 | """ 88 | process children of the elm,return iterable 89 | """ 90 | for _e in list(elm): 91 | if OMML_NS not in _e.tag: 92 | continue 93 | stag = _e.tag.replace(OMML_NS, "") 94 | if include and (stag not in include): 95 | continue 96 | t = self.call_method(_e, stag=stag) 97 | if t is None: 98 | t = self.process_unknow(_e, stag) 99 | if t is None: 100 | continue 101 | yield (stag, t, _e) 102 | 103 | def process_children_dict(self, elm, include=None): 104 | """ 105 | process children of the elm,return dict 106 | """ 107 | latex_chars = dict() 108 | for stag, t, e in self.process_children_list(elm, include): 109 | latex_chars[stag] = t 110 | return latex_chars 111 | 112 | def process_children(self, elm, include=None): 113 | """ 114 | process children of the elm,return string 115 | """ 116 | return BLANK.join( 117 | ( 118 | t if not isinstance(t, Tag2Method) else str(t) 119 | for stag, t, e in self.process_children_list(elm, include) 120 | ) 121 | ) 122 | 123 | def process_unknow(self, elm, stag): 124 | return None 125 | 126 | 127 | class Pr(Tag2Method): 128 | text = "" 129 | 130 | __val_tags = ("chr", "pos", "begChr", "endChr", "type") 131 | 132 | __innerdict = None # can't use the __dict__ 133 | 134 | """ common properties of element""" 135 | 136 | def __init__(self, elm): 137 | self.__innerdict = {} 138 | self.text = self.process_children(elm) 139 | 140 | def __str__(self): 141 | return self.text 142 | 143 | def __unicode__(self): 144 | return self.__str__(self) 145 | 146 | def __getattr__(self, 
name): 147 | return self.__innerdict.get(name, None) 148 | 149 | def do_brk(self, elm): 150 | self.__innerdict["brk"] = BRK 151 | return BRK 152 | 153 | def do_common(self, elm): 154 | stag = elm.tag.replace(OMML_NS, "") 155 | if stag in self.__val_tags: 156 | t = elm.get("{0}val".format(OMML_NS)) 157 | self.__innerdict[stag] = t 158 | return None 159 | 160 | tag2meth = { 161 | "brk": do_brk, 162 | "chr": do_common, 163 | "pos": do_common, 164 | "begChr": do_common, 165 | "endChr": do_common, 166 | "type": do_common, 167 | } 168 | 169 | 170 | class oMath2Latex(Tag2Method): 171 | """ 172 | Convert oMath element of omml to latex 173 | """ 174 | 175 | _t_dict = T 176 | 177 | __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e") 178 | 179 | def __init__(self, element): 180 | self._latex = self.process_children(element) 181 | 182 | def __str__(self): 183 | return self.latex 184 | 185 | def __unicode__(self): 186 | return self.__str__(self) 187 | 188 | def process_unknow(self, elm, stag): 189 | if stag in self.__direct_tags: 190 | return self.process_children(elm) 191 | elif stag[-2:] == "Pr": 192 | return Pr(elm) 193 | else: 194 | return None 195 | 196 | @property 197 | def latex(self): 198 | return self._latex 199 | 200 | def do_acc(self, elm): 201 | """ 202 | the accent function 203 | """ 204 | c_dict = self.process_children_dict(elm) 205 | latex_s = get_val( 206 | c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR 207 | ) 208 | return latex_s.format(c_dict["e"]) 209 | 210 | def do_bar(self, elm): 211 | """ 212 | the bar function 213 | """ 214 | c_dict = self.process_children_dict(elm) 215 | pr = c_dict["barPr"] 216 | latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS) 217 | return pr.text + latex_s.format(c_dict["e"]) 218 | 219 | def do_d(self, elm): 220 | """ 221 | the delimiter object 222 | """ 223 | c_dict = self.process_children_dict(elm) 224 | pr = c_dict["dPr"] 225 | null = D_DEFAULT.get("null") 226 | s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T) 227 | e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T) 228 | return pr.text + D.format( 229 | left=null if not s_val else escape_latex(s_val), 230 | text=c_dict["e"], 231 | right=null if not e_val else escape_latex(e_val), 232 | ) 233 | 234 | def do_spre(self, elm): 235 | """ 236 | the Pre-Sub-Superscript object -- Not support yet 237 | """ 238 | pass 239 | 240 | def do_sub(self, elm): 241 | text = self.process_children(elm) 242 | return SUB.format(text) 243 | 244 | def do_sup(self, elm): 245 | text = self.process_children(elm) 246 | return SUP.format(text) 247 | 248 | def do_f(self, elm): 249 | """ 250 | the fraction object 251 | """ 252 | c_dict = self.process_children_dict(elm) 253 | pr = c_dict["fPr"] 254 | latex_s = get_val(pr.type, default=F_DEFAULT, store=F) 255 | return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den")) 256 | 257 | def do_func(self, elm): 258 | """ 259 | the Function-Apply object (Examples:sin cos) 260 | """ 261 | c_dict = self.process_children_dict(elm) 262 | func_name = c_dict.get("fName") 263 | return func_name.replace(FUNC_PLACE, c_dict.get("e")) 264 | 265 | def do_fname(self, elm): 266 | """ 267 | the func name 268 | """ 269 | latex_chars = [] 270 | for stag, t, e in self.process_children_list(elm): 271 | if stag == "r": 272 | if FUNC.get(t): 273 | latex_chars.append(FUNC[t]) 274 | else: 275 | raise NotImplementedError("Not support func %s" % t) 276 | else: 277 | latex_chars.append(t) 278 | t = 
BLANK.join(latex_chars) 279 | return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this 280 | 281 | def do_groupchr(self, elm): 282 | """ 283 | the Group-Character object 284 | """ 285 | c_dict = self.process_children_dict(elm) 286 | pr = c_dict["groupChrPr"] 287 | latex_s = get_val(pr.chr) 288 | return pr.text + latex_s.format(c_dict["e"]) 289 | 290 | def do_rad(self, elm): 291 | """ 292 | the radical object 293 | """ 294 | c_dict = self.process_children_dict(elm) 295 | text = c_dict.get("e") 296 | deg_text = c_dict.get("deg") 297 | if deg_text: 298 | return RAD.format(deg=deg_text, text=text) 299 | else: 300 | return RAD_DEFAULT.format(text=text) 301 | 302 | def do_eqarr(self, elm): 303 | """ 304 | the Array object 305 | """ 306 | return ARR.format( 307 | text=BRK.join( 308 | [t for stag, t, e in self.process_children_list(elm, include=("e",))] 309 | ) 310 | ) 311 | 312 | def do_limlow(self, elm): 313 | """ 314 | the Lower-Limit object 315 | """ 316 | t_dict = self.process_children_dict(elm, include=("e", "lim")) 317 | latex_s = LIM_FUNC.get(t_dict["e"]) 318 | if not latex_s: 319 | raise NotImplementedError("Not support lim %s" % t_dict["e"]) 320 | else: 321 | return latex_s.format(lim=t_dict.get("lim")) 322 | 323 | def do_limupp(self, elm): 324 | """ 325 | the Upper-Limit object 326 | """ 327 | t_dict = self.process_children_dict(elm, include=("e", "lim")) 328 | return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e")) 329 | 330 | def do_lim(self, elm): 331 | """ 332 | the lower limit of the limLow object and the upper limit of the limUpp function 333 | """ 334 | return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1]) 335 | 336 | def do_m(self, elm): 337 | """ 338 | the Matrix object 339 | """ 340 | rows = [] 341 | for stag, t, e in self.process_children_list(elm): 342 | if stag == "mPr": 343 | pass 344 | elif stag == "mr": 345 | rows.append(t) 346 | return M.format(text=BRK.join(rows)) 347 | 348 | def do_mr(self, elm): 349 | """ 350 | a single row of the matrix m 351 | """ 352 | return ALN.join( 353 | [t for stag, t, e in self.process_children_list(elm, include=("e",))] 354 | ) 355 | 356 | def do_nary(self, elm): 357 | """ 358 | the n-ary object 359 | """ 360 | res = [] 361 | bo = "" 362 | for stag, t, e in self.process_children_list(elm): 363 | if stag == "naryPr": 364 | bo = get_val(t.chr, store=CHR_BO) 365 | else: 366 | res.append(t) 367 | return bo + BLANK.join(res) 368 | 369 | def do_r(self, elm): 370 | """ 371 | Get text from 'r' element,And try convert them to latex symbols 372 | @todo text style support , (sty) 373 | @todo \text (latex pure text support) 374 | """ 375 | _str = [] 376 | for s in elm.findtext("./{0}t".format(OMML_NS)): 377 | # s = s if isinstance(s,unicode) else unicode(s,'utf-8') 378 | _str.append(self._t_dict.get(s, s)) 379 | return escape_latex(BLANK.join(_str)) 380 | 381 | tag2meth = { 382 | "acc": do_acc, 383 | "r": do_r, 384 | "bar": do_bar, 385 | "sub": do_sub, 386 | "sup": do_sup, 387 | "f": do_f, 388 | "func": do_func, 389 | "fName": do_fname, 390 | "groupChr": do_groupchr, 391 | "d": do_d, 392 | "rad": do_rad, 393 | "eqArr": do_eqarr, 394 | "limLow": do_limlow, 395 | "limUpp": do_limupp, 396 | "lim": do_lim, 397 | "m": do_m, 398 | "mr": do_mr, 399 | "nary": do_nary, 400 | } 401 | ``` -------------------------------------------------------------------------------- /packages/markitdown/ThirdPartyNotices.md: -------------------------------------------------------------------------------- ```markdown 1 | # THIRD-PARTY 
SOFTWARE NOTICES AND INFORMATION
2 | 
3 | **Do Not Translate or Localize**
4 | 
5 | This project incorporates components from the projects listed below. The original copyright notices and the licenses
6 | under which MarkItDown received such components are set forth below. MarkItDown reserves all rights not expressly
7 | granted herein, whether by implication, estoppel or otherwise.
8 | 
9 | 1. dwml (https://github.com/xiilei/dwml)
10 | 
11 | dwml NOTICES AND INFORMATION BEGIN HERE
12 | 
13 | -----------------------------------------
14 | 
15 | NOTE 1: What follows is a verbatim copy of dwml's LICENSE file, as it appeared on March 28th, 2025 - including
16 | placeholders for the copyright owner and year.
17 | 
18 | NOTE 2: The Apache License, Version 2.0, requires that modifications to the dwml source code be documented.
19 | The following section summarizes these changes. The full details are available in the MarkItDown source code
20 | repository under PR #1160 (https://github.com/microsoft/markitdown/pull/1160).
21 | 
22 | This project incorporates the `dwml/latex_dict.py` and `dwml/omml.py` files without any logic modifications (they
23 | live under `packages/markitdown/src/markitdown/converter_utils/docx/math`). However, we have reformatted the code
24 | with the `black` code formatter. From the `tests/docx.py` file, we used only the `DOCXML_ROOT` XML namespaces; the
25 | rest of the file is not used.
26 | 
27 | -----------------------------------------
28 | 
29 | Apache License
30 | Version 2.0, January 2004
31 | http://www.apache.org/licenses/
32 | 
33 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
34 | 
35 | 1. Definitions.
36 | 
37 | "License" shall mean the terms and conditions for use, reproduction,
38 | and distribution as defined by Sections 1 through 9 of this document.
39 | 
40 | "Licensor" shall mean the copyright owner or entity authorized by
41 | the copyright owner that is granting the License.
42 | 
43 | "Legal Entity" shall mean the union of the acting entity and all
44 | other entities that control, are controlled by, or are under common
45 | control with that entity. For the purposes of this definition,
46 | "control" means (i) the power, direct or indirect, to cause the
47 | direction or management of such entity, whether by contract or
48 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
49 | outstanding shares, or (iii) beneficial ownership of such entity.
50 | 
51 | "You" (or "Your") shall mean an individual or Legal Entity
52 | exercising permissions granted by this License.
53 | 
54 | "Source" form shall mean the preferred form for making modifications,
55 | including but not limited to software source code, documentation
56 | source, and configuration files.
57 | 
58 | "Object" form shall mean any form resulting from mechanical
59 | transformation or translation of a Source form, including but
60 | not limited to compiled object code, generated documentation,
61 | and conversions to other media types.
62 | 
63 | "Work" shall mean the work of authorship, whether in Source or
64 | Object form, made available under the License, as indicated by a
65 | copyright notice that is included in or attached to the work
66 | (an example is provided in the Appendix below). 
67 | 68 | "Derivative Works" shall mean any work, whether in Source or Object 69 | form, that is based on (or derived from) the Work and for which the 70 | editorial revisions, annotations, elaborations, or other modifications 71 | represent, as a whole, an original work of authorship. For the purposes 72 | of this License, Derivative Works shall not include works that remain 73 | separable from, or merely link (or bind by name) to the interfaces of, 74 | the Work and Derivative Works thereof. 75 | 76 | "Contribution" shall mean any work of authorship, including 77 | the original version of the Work and any modifications or additions 78 | to that Work or Derivative Works thereof, that is intentionally 79 | submitted to Licensor for inclusion in the Work by the copyright owner 80 | or by an individual or Legal Entity authorized to submit on behalf of 81 | the copyright owner. For the purposes of this definition, "submitted" 82 | means any form of electronic, verbal, or written communication sent 83 | to the Licensor or its representatives, including but not limited to 84 | communication on electronic mailing lists, source code control systems, 85 | and issue tracking systems that are managed by, or on behalf of, the 86 | Licensor for the purpose of discussing and improving the Work, but 87 | excluding communication that is conspicuously marked or otherwise 88 | designated in writing by the copyright owner as "Not a Contribution." 89 | 90 | "Contributor" shall mean Licensor and any individual or Legal Entity 91 | on behalf of whom a Contribution has been received by Licensor and 92 | subsequently incorporated within the Work. 93 | 94 | 2. Grant of Copyright License. Subject to the terms and conditions of 95 | this License, each Contributor hereby grants to You a perpetual, 96 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 97 | copyright license to reproduce, prepare Derivative Works of, 98 | publicly display, publicly perform, sublicense, and distribute the 99 | Work and such Derivative Works in Source or Object form. 100 | 101 | 3. Grant of Patent License. Subject to the terms and conditions of 102 | this License, each Contributor hereby grants to You a perpetual, 103 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 104 | (except as stated in this section) patent license to make, have made, 105 | use, offer to sell, sell, import, and otherwise transfer the Work, 106 | where such license applies only to those patent claims licensable 107 | by such Contributor that are necessarily infringed by their 108 | Contribution(s) alone or by combination of their Contribution(s) 109 | with the Work to which such Contribution(s) was submitted. If You 110 | institute patent litigation against any entity (including a 111 | cross-claim or counterclaim in a lawsuit) alleging that the Work 112 | or a Contribution incorporated within the Work constitutes direct 113 | or contributory patent infringement, then any patent licenses 114 | granted to You under this License for that Work shall terminate 115 | as of the date such litigation is filed. 116 | 117 | 4. Redistribution. 
You may reproduce and distribute copies of the 118 | Work or Derivative Works thereof in any medium, with or without 119 | modifications, and in Source or Object form, provided that You 120 | meet the following conditions: 121 | 122 | (a) You must give any other recipients of the Work or 123 | Derivative Works a copy of this License; and 124 | 125 | (b) You must cause any modified files to carry prominent notices 126 | stating that You changed the files; and 127 | 128 | (c) You must retain, in the Source form of any Derivative Works 129 | that You distribute, all copyright, patent, trademark, and 130 | attribution notices from the Source form of the Work, 131 | excluding those notices that do not pertain to any part of 132 | the Derivative Works; and 133 | 134 | (d) If the Work includes a "NOTICE" text file as part of its 135 | distribution, then any Derivative Works that You distribute must 136 | include a readable copy of the attribution notices contained 137 | within such NOTICE file, excluding those notices that do not 138 | pertain to any part of the Derivative Works, in at least one 139 | of the following places: within a NOTICE text file distributed 140 | as part of the Derivative Works; within the Source form or 141 | documentation, if provided along with the Derivative Works; or, 142 | within a display generated by the Derivative Works, if and 143 | wherever such third-party notices normally appear. The contents 144 | of the NOTICE file are for informational purposes only and 145 | do not modify the License. You may add Your own attribution 146 | notices within Derivative Works that You distribute, alongside 147 | or as an addendum to the NOTICE text from the Work, provided 148 | that such additional attribution notices cannot be construed 149 | as modifying the License. 150 | 151 | You may add Your own copyright statement to Your modifications and 152 | may provide additional or different license terms and conditions 153 | for use, reproduction, or distribution of Your modifications, or 154 | for any such Derivative Works as a whole, provided Your use, 155 | reproduction, and distribution of the Work otherwise complies with 156 | the conditions stated in this License. 157 | 158 | 5. Submission of Contributions. Unless You explicitly state otherwise, 159 | any Contribution intentionally submitted for inclusion in the Work 160 | by You to the Licensor shall be under the terms and conditions of 161 | this License, without any additional terms or conditions. 162 | Notwithstanding the above, nothing herein shall supersede or modify 163 | the terms of any separate license agreement you may have executed 164 | with Licensor regarding such Contributions. 165 | 166 | 6. Trademarks. This License does not grant permission to use the trade 167 | names, trademarks, service marks, or product names of the Licensor, 168 | except as required for reasonable and customary use in describing the 169 | origin of the Work and reproducing the content of the NOTICE file. 170 | 171 | 7. Disclaimer of Warranty. Unless required by applicable law or 172 | agreed to in writing, Licensor provides the Work (and each 173 | Contributor provides its Contributions) on an "AS IS" BASIS, 174 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 175 | implied, including, without limitation, any warranties or conditions 176 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 177 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 178 | appropriateness of using or redistributing the Work and assume any 179 | risks associated with Your exercise of permissions under this License. 180 | 181 | 8. Limitation of Liability. In no event and under no legal theory, 182 | whether in tort (including negligence), contract, or otherwise, 183 | unless required by applicable law (such as deliberate and grossly 184 | negligent acts) or agreed to in writing, shall any Contributor be 185 | liable to You for damages, including any direct, indirect, special, 186 | incidental, or consequential damages of any character arising as a 187 | result of this License or out of the use or inability to use the 188 | Work (including but not limited to damages for loss of goodwill, 189 | work stoppage, computer failure or malfunction, or any and all 190 | other commercial damages or losses), even if such Contributor 191 | has been advised of the possibility of such damages. 192 | 193 | 9. Accepting Warranty or Additional Liability. While redistributing 194 | the Work or Derivative Works thereof, You may choose to offer, 195 | and charge a fee for, acceptance of support, warranty, indemnity, 196 | or other liability obligations and/or rights consistent with this 197 | License. However, in accepting such obligations, You may act only 198 | on Your own behalf and on Your sole responsibility, not on behalf 199 | of any other Contributor, and only if You agree to indemnify, 200 | defend, and hold each Contributor harmless for any liability 201 | incurred by, or claims asserted against, such Contributor by reason 202 | of your accepting any such warranty or additional liability. 203 | 204 | END OF TERMS AND CONDITIONS 205 | 206 | APPENDIX: How to apply the Apache License to your work. 207 | 208 | To apply the Apache License to your work, attach the following 209 | boilerplate notice, with the fields enclosed by brackets "{}" 210 | replaced with your own identifying information. (Don't include 211 | the brackets!) The text should be enclosed in the appropriate 212 | comment syntax for the file format. We also recommend that a 213 | file or class name and description of purpose be included on the 214 | same "printed page" as the copyright notice for easier 215 | identification within third-party archives. 216 | 217 | Copyright {yyyy} {name of copyright owner} 218 | 219 | Licensed under the Apache License, Version 2.0 (the "License"); 220 | you may not use this file except in compliance with the License. 221 | You may obtain a copy of the License at 222 | 223 | http://www.apache.org/licenses/LICENSE-2.0 224 | 225 | Unless required by applicable law or agreed to in writing, software 226 | distributed under the License is distributed on an "AS IS" BASIS, 227 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 228 | See the License for the specific language governing permissions and 229 | limitations under the License. 
230 | 231 | ----------------------------------------- 232 | END OF dwml NOTICES AND INFORMATION ``` -------------------------------------------------------------------------------- /packages/markitdown/tests/test_module_misc.py: -------------------------------------------------------------------------------- ```python 1 | #!/usr/bin/env python3 -m pytest 2 | import io 3 | import os 4 | import re 5 | import shutil 6 | import pytest 7 | from unittest.mock import MagicMock 8 | 9 | from markitdown._uri_utils import parse_data_uri, file_uri_to_path 10 | 11 | from markitdown import ( 12 | MarkItDown, 13 | UnsupportedFormatException, 14 | FileConversionException, 15 | StreamInfo, 16 | ) 17 | 18 | # This file contains module tests that are not directly tested by the FileTestVectors. 19 | # This includes things like helper functions and runtime conversion options 20 | # (e.g., LLM clients, exiftool path, transcription services, etc.) 21 | 22 | skip_remote = ( 23 | True if os.environ.get("GITHUB_ACTIONS") else False 24 | ) # Don't run these tests in CI 25 | 26 | 27 | # Don't run the llm tests without a key and the client library 28 | skip_llm = False if os.environ.get("OPENAI_API_KEY") else True 29 | try: 30 | import openai 31 | except ModuleNotFoundError: 32 | skip_llm = True 33 | 34 | # Skip exiftool tests if not installed 35 | skip_exiftool = shutil.which("exiftool") is None 36 | 37 | TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") 38 | 39 | JPG_TEST_EXIFTOOL = { 40 | "Author": "AutoGen Authors", 41 | "Title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 42 | "Description": "AutoGen enables diverse LLM-based applications", 43 | "ImageSize": "1615x1967", 44 | "DateTimeOriginal": "2024:03:14 22:10:00", 45 | } 46 | 47 | MP3_TEST_EXIFTOOL = { 48 | "Title": "f67a499e-a7d0-4ca3-a49b-358bd934ae3e", 49 | "Artist": "Artist Name Test String", 50 | "Album": "Album Name Test String", 51 | "SampleRate": "48000", 52 | } 53 | 54 | PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf" 55 | PDF_TEST_STRINGS = [ 56 | "While there is contemporaneous exploration of multi-agent approaches" 57 | ] 58 | 59 | YOUTUBE_TEST_URL = "https://www.youtube.com/watch?v=V2qZ_lgxTzg" 60 | YOUTUBE_TEST_STRINGS = [ 61 | "## AutoGen FULL Tutorial with Python (Step-By-Step)", 62 | "This is an intermediate tutorial for installing and using AutoGen locally", 63 | "PT15M4S", 64 | "the model we're going to be using today is GPT 3.5 turbo", # From the transcript 65 | ] 66 | 67 | DOCX_COMMENT_TEST_STRINGS = [ 68 | "314b0a30-5b04-470b-b9f7-eed2c2bec74a", 69 | "49e168b7-d2ae-407f-a055-2167576f39a1", 70 | "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f", 71 | "# Abstract", 72 | "# Introduction", 73 | "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 74 | "This is a test comment. 12df-321a", 75 | "Yet another comment in the doc. 55yiyi-asd09", 76 | ] 77 | 78 | BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" 79 | BLOG_TEST_STRINGS = [ 80 | "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? 
What about inference parameters?",
81 |     "an example where high cost can easily prevent a generic complex",
82 | ]
83 | 
84 | LLM_TEST_STRINGS = [
85 |     "5bda1dd6",
86 | ]
87 | 
88 | PPTX_TEST_STRINGS = [
89 |     "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
90 |     "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
91 |     "44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
92 |     "1b92870d-e3b5-4e65-8153-919f4ff45592",
93 |     "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
94 |     "a3f6004b-6f4f-4ea8-bee3-3741f4dc385f",  # chart title
95 |     "2003",  # chart value
96 | ]
97 | 
98 | 
99 | # --- Helper Functions ---
100 | def validate_strings(result, expected_strings, exclude_strings=None):
101 |     """Validate presence or absence of specific strings."""
102 |     text_content = result.text_content.replace("\\", "")
103 |     for string in expected_strings:
104 |         assert string in text_content
105 |     if exclude_strings:
106 |         for string in exclude_strings:
107 |             assert string not in text_content
108 | 
109 | 
110 | def test_stream_info_operations() -> None:
111 |     """Test operations performed on StreamInfo objects."""
112 | 
113 |     stream_info_original = StreamInfo(
114 |         mimetype="mimetype.1",
115 |         extension="extension.1",
116 |         charset="charset.1",
117 |         filename="filename.1",
118 |         local_path="local_path.1",
119 |         url="url.1",
120 |     )
121 | 
122 |     # Check updating all attributes by keyword
123 |     keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
124 |     for keyword in keywords:
125 |         updated_stream_info = stream_info_original.copy_and_update(
126 |             **{keyword: f"{keyword}.2"}
127 |         )
128 | 
129 |         # Make sure the targeted attribute is updated
130 |         assert getattr(updated_stream_info, keyword) == f"{keyword}.2"
131 | 
132 |         # Make sure the other attributes are unchanged
133 |         for k in keywords:
134 |             if k != keyword:
135 |                 assert getattr(stream_info_original, k) == getattr(
136 |                     updated_stream_info, k
137 |                 )
138 | 
139 |     # Check updating all attributes by passing a new StreamInfo object
140 |     keywords = ["mimetype", "extension", "charset", "filename", "local_path", "url"]
141 |     for keyword in keywords:
142 |         updated_stream_info = stream_info_original.copy_and_update(
143 |             StreamInfo(**{keyword: f"{keyword}.2"})
144 |         )
145 | 
146 |         # Make sure the targeted attribute is updated
147 |         assert getattr(updated_stream_info, keyword) == f"{keyword}.2"
148 | 
149 |         # Make sure the other attributes are unchanged
150 |         for k in keywords:
151 |             if k != keyword:
152 |                 assert getattr(stream_info_original, k) == getattr(
153 |                     updated_stream_info, k
154 |                 )
155 | 
156 |     # Check mixing and matching
157 |     updated_stream_info = stream_info_original.copy_and_update(
158 |         StreamInfo(extension="extension.2", filename="filename.2"),
159 |         mimetype="mimetype.3",
160 |         charset="charset.3",
161 |     )
162 |     assert updated_stream_info.extension == "extension.2"
163 |     assert updated_stream_info.filename == "filename.2"
164 |     assert updated_stream_info.mimetype == "mimetype.3"
165 |     assert updated_stream_info.charset == "charset.3"
166 |     assert updated_stream_info.local_path == "local_path.1"
167 |     assert updated_stream_info.url == "url.1"
168 | 
169 |     # Check multiple StreamInfo objects
170 |     updated_stream_info = stream_info_original.copy_and_update(
171 |         StreamInfo(extension="extension.4", filename="filename.5"),
172 |         StreamInfo(mimetype="mimetype.6", charset="charset.7"),
173 |     )
174 |     assert updated_stream_info.extension == "extension.4"
175 |     assert updated_stream_info.filename == "filename.5"
176 |     assert 
updated_stream_info.mimetype == "mimetype.6" 177 | assert updated_stream_info.charset == "charset.7" 178 | assert updated_stream_info.local_path == "local_path.1" 179 | assert updated_stream_info.url == "url.1" 180 | 181 | 182 | def test_data_uris() -> None: 183 | # Test basic parsing of data URIs 184 | data_uri = "data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==" 185 | mime_type, attributes, data = parse_data_uri(data_uri) 186 | assert mime_type == "text/plain" 187 | assert len(attributes) == 0 188 | assert data == b"Hello, World!" 189 | 190 | data_uri = "data:base64,SGVsbG8sIFdvcmxkIQ==" 191 | mime_type, attributes, data = parse_data_uri(data_uri) 192 | assert mime_type is None 193 | assert len(attributes) == 0 194 | assert data == b"Hello, World!" 195 | 196 | data_uri = "data:text/plain;charset=utf-8;base64,SGVsbG8sIFdvcmxkIQ==" 197 | mime_type, attributes, data = parse_data_uri(data_uri) 198 | assert mime_type == "text/plain" 199 | assert len(attributes) == 1 200 | assert attributes["charset"] == "utf-8" 201 | assert data == b"Hello, World!" 202 | 203 | data_uri = "data:,Hello%2C%20World%21" 204 | mime_type, attributes, data = parse_data_uri(data_uri) 205 | assert mime_type is None 206 | assert len(attributes) == 0 207 | assert data == b"Hello, World!" 208 | 209 | data_uri = "data:text/plain,Hello%2C%20World%21" 210 | mime_type, attributes, data = parse_data_uri(data_uri) 211 | assert mime_type == "text/plain" 212 | assert len(attributes) == 0 213 | assert data == b"Hello, World!" 214 | 215 | data_uri = "data:text/plain;charset=utf-8,Hello%2C%20World%21" 216 | mime_type, attributes, data = parse_data_uri(data_uri) 217 | assert mime_type == "text/plain" 218 | assert len(attributes) == 1 219 | assert attributes["charset"] == "utf-8" 220 | assert data == b"Hello, World!" 
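

def _example_parse_data_uri_usage() -> None:
    # Editor's sketch -- not part of the original test suite. It shows how a
    # data URI can be built with only the standard library and parsed back
    # with parse_data_uri, mirroring the assertions in test_data_uris above.
    # The payload and mimetype are illustrative values.
    import base64

    payload = base64.b64encode(b"Hello, World!").decode("ascii")
    mime_type, attributes, data = parse_data_uri(f"data:text/plain;base64,{payload}")
    assert mime_type == "text/plain"
    assert len(attributes) == 0
    assert data == b"Hello, World!"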
221 | 222 | 223 | def test_file_uris() -> None: 224 | # Test file URI with an empty host 225 | file_uri = "file:///path/to/file.txt" 226 | netloc, path = file_uri_to_path(file_uri) 227 | assert netloc is None 228 | assert path == "/path/to/file.txt" 229 | 230 | # Test file URI with no host 231 | file_uri = "file:/path/to/file.txt" 232 | netloc, path = file_uri_to_path(file_uri) 233 | assert netloc is None 234 | assert path == "/path/to/file.txt" 235 | 236 | # Test file URI with localhost 237 | file_uri = "file://localhost/path/to/file.txt" 238 | netloc, path = file_uri_to_path(file_uri) 239 | assert netloc == "localhost" 240 | assert path == "/path/to/file.txt" 241 | 242 | # Test file URI with query parameters 243 | file_uri = "file:///path/to/file.txt?param=value" 244 | netloc, path = file_uri_to_path(file_uri) 245 | assert netloc is None 246 | assert path == "/path/to/file.txt" 247 | 248 | # Test file URI with fragment 249 | file_uri = "file:///path/to/file.txt#fragment" 250 | netloc, path = file_uri_to_path(file_uri) 251 | assert netloc is None 252 | assert path == "/path/to/file.txt" 253 | 254 | 255 | def test_docx_comments() -> None: 256 | # Test DOCX processing, with comments and setting style_map on init 257 | markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") 258 | result = markitdown_with_style_map.convert( 259 | os.path.join(TEST_FILES_DIR, "test_with_comment.docx") 260 | ) 261 | validate_strings(result, DOCX_COMMENT_TEST_STRINGS) 262 | 263 | 264 | def test_docx_equations() -> None: 265 | markitdown = MarkItDown() 266 | docx_file = os.path.join(TEST_FILES_DIR, "equations.docx") 267 | result = markitdown.convert(docx_file) 268 | 269 | # Check for inline equation m=1 (wrapped with single $) is present 270 | assert "$m=1$" in result.text_content, "Inline equation $m=1$ not found" 271 | 272 | # Find block equations wrapped with double $$ and check if they are present 273 | block_equations = re.findall(r"\$\$(.+?)\$\$", result.text_content) 274 | assert block_equations, "No block equations found in the document." 
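

def _example_equation_markers() -> None:
    # Editor's sketch with a hypothetical sample string (not a repository
    # fixture). The conventions verified in test_docx_equations apply to any
    # Markdown text: inline math is wrapped in single dollar signs, and block
    # math is wrapped in double dollar signs.
    sample = "Inline math $m=1$ and a block equation: $$E = mc^2$$"
    assert "$m=1$" in sample
    block_equations = re.findall(r"\$\$(.+?)\$\$", sample)
    assert block_equations == ["E = mc^2"]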
275 | 
276 | 
277 | def test_input_as_strings() -> None:
278 |     markitdown = MarkItDown()
279 | 
280 |     # Test input from a stream
281 |     input_data = b"<html><body><h1>Test</h1></body></html>"
282 |     result = markitdown.convert_stream(io.BytesIO(input_data))
283 |     assert "# Test" in result.text_content
284 | 
285 |     # Test input with leading blank characters
286 |     input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
287 |     result = markitdown.convert_stream(io.BytesIO(input_data))
288 |     assert "# Test" in result.text_content
289 | 
290 | 
291 | @pytest.mark.skipif(
292 |     skip_remote,
293 |     reason="do not run tests that query external urls",
294 | )
295 | def test_markitdown_remote() -> None:
296 |     markitdown = MarkItDown()
297 | 
298 |     # By URL
299 |     result = markitdown.convert(PDF_TEST_URL)
300 |     for test_string in PDF_TEST_STRINGS:
301 |         assert test_string in result.text_content
302 | 
303 |     # YouTube
304 |     result = markitdown.convert(YOUTUBE_TEST_URL)
305 |     for test_string in YOUTUBE_TEST_STRINGS:
306 |         assert test_string in result.text_content
307 | 
308 | 
309 | @pytest.mark.skipif(
310 |     skip_remote,
311 |     reason="do not run remote speech transcription tests",
312 | )
313 | def test_speech_transcription() -> None:
314 |     markitdown = MarkItDown()
315 | 
316 |     # Test WAV, MP3, and M4A files
317 |     for file_name in ["test.wav", "test.mp3", "test.m4a"]:
318 |         result = markitdown.convert(os.path.join(TEST_FILES_DIR, file_name))
319 |         result_lower = result.text_content.lower()
320 |         assert (
321 |             ("1" in result_lower or "one" in result_lower)
322 |             and ("2" in result_lower or "two" in result_lower)
323 |             and ("3" in result_lower or "three" in result_lower)
324 |             and ("4" in result_lower or "four" in result_lower)
325 |             and ("5" in result_lower or "five" in result_lower)
326 |         )
327 | 
328 | 
329 | def test_exceptions() -> None:
330 |     # Check that an exception is raised when trying to convert an unsupported format
331 |     markitdown = MarkItDown()
332 |     with pytest.raises(UnsupportedFormatException):
333 |         markitdown.convert(os.path.join(TEST_FILES_DIR, "random.bin"))
334 | 
335 |     # Check that an exception is raised when trying to convert a file that is corrupted
336 |     with pytest.raises(FileConversionException) as exc_info:
337 |         markitdown.convert(
338 |             os.path.join(TEST_FILES_DIR, "random.bin"), file_extension=".pptx"
339 |         )
340 |     assert len(exc_info.value.attempts) == 1
341 |     assert type(exc_info.value.attempts[0].converter).__name__ == "PptxConverter"
342 | 
343 | 
344 | @pytest.mark.skipif(
345 |     skip_exiftool,
346 |     reason="do not run if exiftool is not installed",
347 | )
348 | def test_markitdown_exiftool() -> None:
349 |     which_exiftool = shutil.which("exiftool")
350 |     assert which_exiftool is not None
351 | 
352 |     # Test explicitly setting the location of exiftool
353 |     markitdown = MarkItDown(exiftool_path=which_exiftool)
354 |     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
355 |     for key in JPG_TEST_EXIFTOOL:
356 |         target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
357 |         assert target in result.text_content
358 | 
359 |     # Test setting the exiftool path through an environment variable
360 |     os.environ["EXIFTOOL_PATH"] = which_exiftool
361 |     markitdown = MarkItDown()
362 |     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
363 |     for key in JPG_TEST_EXIFTOOL:
364 |         target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
365 |         assert target in result.text_content
366 | 
367 |     # Test some other media types
368 |     result = 
markitdown.convert(os.path.join(TEST_FILES_DIR, "test.mp3")) 369 | for key in MP3_TEST_EXIFTOOL: 370 | target = f"{key}: {MP3_TEST_EXIFTOOL[key]}" 371 | assert target in result.text_content 372 | 373 | 374 | def test_markitdown_llm_parameters() -> None: 375 | """Test that LLM parameters are correctly passed to the client.""" 376 | mock_client = MagicMock() 377 | mock_response = MagicMock() 378 | mock_response.choices = [ 379 | MagicMock( 380 | message=MagicMock( 381 | content="Test caption with red circle and blue square 5bda1dd6" 382 | ) 383 | ) 384 | ] 385 | mock_client.chat.completions.create.return_value = mock_response 386 | 387 | test_prompt = "You are a professional test prompt." 388 | markitdown = MarkItDown( 389 | llm_client=mock_client, llm_model="gpt-4o", llm_prompt=test_prompt 390 | ) 391 | 392 | # Test image file 393 | markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg")) 394 | 395 | # Verify the prompt was passed to the OpenAI API 396 | assert mock_client.chat.completions.create.called 397 | call_args = mock_client.chat.completions.create.call_args 398 | messages = call_args[1]["messages"] 399 | assert len(messages) == 1 400 | assert messages[0]["content"][0]["text"] == test_prompt 401 | 402 | # Reset the mock for the next test 403 | mock_client.chat.completions.create.reset_mock() 404 | 405 | # TODO: may only use one test after the llm caption method duplicate has been removed: 406 | # https://github.com/microsoft/markitdown/pull/1254 407 | # Test PPTX file 408 | markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) 409 | 410 | # Verify the prompt was passed to the OpenAI API for PPTX images too 411 | assert mock_client.chat.completions.create.called 412 | call_args = mock_client.chat.completions.create.call_args 413 | messages = call_args[1]["messages"] 414 | assert len(messages) == 1 415 | assert messages[0]["content"][0]["text"] == test_prompt 416 | 417 | 418 | @pytest.mark.skipif( 419 | skip_llm, 420 | reason="do not run llm tests without a key", 421 | ) 422 | def test_markitdown_llm() -> None: 423 | client = openai.OpenAI() 424 | markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o") 425 | 426 | result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg")) 427 | for test_string in LLM_TEST_STRINGS: 428 | assert test_string in result.text_content 429 | 430 | # This is not super precise. It would also accept "red square", "blue circle", 431 | # "the square is not blue", etc. But it's sufficient for this test. 
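    # The loop below only checks that both color words and both shape words
    # appear somewhere in the lowercased caption.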
432 | for test_string in ["red", "circle", "blue", "square"]: 433 | assert test_string in result.text_content.lower() 434 | 435 | # Images embedded in PPTX files 436 | result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) 437 | # LLM Captions are included 438 | for test_string in LLM_TEST_STRINGS: 439 | assert test_string in result.text_content 440 | # Standard alt text is included 441 | validate_strings(result, PPTX_TEST_STRINGS) 442 | 443 | 444 | if __name__ == "__main__": 445 | """Runs this file's tests from the command line.""" 446 | for test in [ 447 | test_stream_info_operations, 448 | test_data_uris, 449 | test_file_uris, 450 | test_docx_comments, 451 | test_input_as_strings, 452 | test_markitdown_remote, 453 | test_speech_transcription, 454 | test_exceptions, 455 | test_markitdown_exiftool, 456 | test_markitdown_llm_parameters, 457 | test_markitdown_llm, 458 | ]: 459 | print(f"Running {test.__name__}...", end="") 460 | test() 461 | print("OK") 462 | print("All tests passed!") 463 | ``` -------------------------------------------------------------------------------- /packages/markitdown/tests/test_files/test_blog.html: -------------------------------------------------------------------------------- ```html 1 | <!doctype html> 2 | <html lang="en" dir="ltr" class="blog-wrapper blog-post-page plugin-blog plugin-id-default" data-has-hydrated="false"> 3 | <head> 4 | <meta charset="UTF-8"> 5 | <meta name="generator" content="Docusaurus v3.1.1"> 6 | <title data-rh="true">Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"><meta data-rh="true" property="og:locale" content="en"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docusaurus_tag" content="default"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docsearch:docusaurus_tag" content="default"><meta data-rh="true" property="og:title" content="Does Model and Inference Parameter Matter in LLM Applications? 
- A Case Study for MATH | AutoGen"><meta data-rh="true" name="description" content="level 2 algebra"><meta data-rh="true" property="og:description" content="level 2 algebra"><meta data-rh="true" property="og:type" content="article"><meta data-rh="true" property="article:published_time" content="2023-04-21T00:00:00.000Z"><meta data-rh="true" property="article:author" content="https://www.linkedin.com/in/chi-wang-49b15b16/"><meta data-rh="true" property="article:tag" content="LLM,GPT,research"><link data-rh="true" rel="icon" href="/autogen/img/ag.ico"><link data-rh="true" rel="canonical" href="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"><link data-rh="true" rel="alternate" href="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" hreflang="en"><link data-rh="true" rel="alternate" href="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" hreflang="x-default"><link rel="alternate" type="application/rss+xml" href="/autogen/blog/rss.xml" title="AutoGen RSS Feed"> 7 | <link rel="alternate" type="application/atom+xml" href="/autogen/blog/atom.xml" title="AutoGen Atom Feed"> 8 | 9 | 10 | 11 | 12 | 13 | 14 | <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-Um5gpz1odJg5Z4HAmzPtgZKdTBHZdw8S29IecapCSB31ligYPhHQZMIlWLYQGVoc" crossorigin="anonymous"> 15 | <script src="/autogen/js/custom.js" async defer="defer"></script><link rel="stylesheet" href="/autogen/assets/css/styles.ca10f300.css"> 16 | <script src="/autogen/assets/js/runtime~main.83ab9fec.js" defer="defer"></script> 17 | <script src="/autogen/assets/js/main.5d28c826.js" defer="defer"></script> 18 | </head> 19 | <body class="navigation-with-keyboard"> 20 | <script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return localStorage.getItem("theme")}catch(t){}}();t(null!==e?e:"light")}(),function(){try{const a=new URLSearchParams(window.location.search).entries();for(var[t,e]of a)if(t.startsWith("docusaurus-data-")){var n=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(n,e)}}catch(t){}}(),document.documentElement.setAttribute("data-announcement-bar-initially-dismissed",function(){try{return"true"===localStorage.getItem("docusaurus.announcement.dismiss")}catch(t){}return!1}())</script><div id="__docusaurus"><div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><div class="announcementBar_mb4j" style="background-color:#fafbfc;color:#091E42" role="banner"><div class="announcementBarPlaceholder_vyr4"></div><div class="content_knG7 announcementBarContent_xLdY">What's new in AutoGen? 
Read <a href="/autogen/blog/2024/03/03/AutoGen-Update">this blog</a> for an overview of updates</div><button type="button" aria-label="Close" class="clean-btn close closeButton_CVFx announcementBarClose_gvF7"><svg viewBox="0 0 15 15" width="14" height="14"><g stroke="currentColor" stroke-width="3.1"><path d="M.75.75l13.5 13.5M14.25.75L.75 14.25"></path></g></svg></button></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/autogen/"><div class="navbar__logo"><img src="/autogen/img/ag.svg" alt="AutoGen" class="themedComponent_mlkZ themedComponent--light_NVdE"><img src="/autogen/img/ag.svg" alt="AutoGen" class="themedComponent_mlkZ themedComponent--dark_xIcU"></div><b class="navbar__title text--truncate">AutoGen</b></a><a class="navbar__item navbar__link" href="/autogen/docs/Getting-Started">Docs</a><a class="navbar__item navbar__link" href="/autogen/docs/reference/agentchat/conversable_agent">API</a><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/autogen/blog">Blog</a><a class="navbar__item navbar__link" href="/autogen/docs/FAQ">FAQ</a><a class="navbar__item navbar__link" href="/autogen/docs/Examples">Examples</a><a class="navbar__item navbar__link" href="/autogen/docs/notebooks">Notebooks</a><a class="navbar__item navbar__link" href="/autogen/docs/Gallery">Gallery</a></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Other Languages</a><ul class="dropdown__menu"><li><a href="https://microsoft.github.io/autogen-for-net/" target="_blank" rel="noopener noreferrer" class="dropdown__link">Dotnet<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><a href="https://github.com/microsoft/autogen" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP toggleButtonDisabled_aARS" type="button" disabled="" title="Switch between dark and light mode (currently light mode)" aria-label="Switch between dark and light mode (currently light mode)" aria-live="polite"><svg viewBox="0 0 24 24" width="24" height="24" class="lightToggleIcon_pyhR"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z 
M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" class="darkToggleIcon_wfgR"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg></button></div><div class="navbarSearchContainer_Bca1"><div class="navbar__search searchBarContainer_NW3z"><input placeholder="Search" aria-label="Search" class="navbar__search-input"><div class="loadingRing_RJI3 searchBarLoadingRing_YnHq"><div></div><div></div><div></div><div></div></div><div class="searchHintContainer_Pkmr"><kbd class="searchHint_iIMx">ctrl</kbd><kbd class="searchHint_iIMx">K</kbd></div></div></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0"><div class="container margin-vert--lg"><div class="row"><aside class="col col--3"><nav class="sidebar_re4s thin-scrollbar" aria-label="Blog recent posts navigation"><div class="sidebarItemTitle_pO2u margin-bottom--md">Recent posts</div><ul class="sidebarItemList_Yudw clean-list"><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/03/03/AutoGen-Update">What's New in AutoGen?</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/02/29/StateFlow">StateFlow - Build LLM Workflows with Customized State-Oriented Transition Function in GroupChat</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/02/11/FSM-GroupChat">FSM Group Chat -- User-specified agent transitions</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/02/02/AutoAnny">Anny: Assisting AutoGen Devs Via AutoGen</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/01/26/Custom-Models">AutoGen with Custom Models: Empowering Users to Use Their Own Inference Mechanism</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/01/25/AutoGenBench">AutoGenBench -- A Tool for Measuring and Evaluating AutoGen Agents</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/01/23/Code-execution-in-docker">Code execution is now by default inside docker container</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/12/29/AgentDescriptions">All About Agent Descriptions</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/12/23/AgentOptimizer">AgentOptimizer - An Agentic Way to Train Your LLM Agent</a></li><li 
class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/12/01/AutoGenStudio">AutoGen Studio: Interactively Explore Multi-Agent Workflows</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/26/Agent-AutoBuild">Agent AutoBuild - Automatically Building Multi-agent Systems</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/20/AgentEval">How to Assess Utility of LLM-powered Applications?</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/13/OAI-assistants">AutoGen Meets GPTs</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/09/EcoAssistant">EcoAssistant - Using LLM Assistants More Accurately and Affordably</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/06/LMM-Agent">Multimodal with GPT-4V and LLaVA</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/10/26/TeachableAgent">AutoGen's Teachable Agents</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/10/18/RetrieveChat">Retrieval-Augmented Generation (RAG) Applications with AutoGen</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/07/14/Local-LLMs">Use AutoGen for Local LLMs</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/06/28/MathChat">MathChat - An Conversational Framework to Solve Math Problems</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/05/18/GPT-adaptive-humaneval">Achieve More, Pay Less - Use GPT-4 Smartly</a></li><li class="sidebarItem__DBe"><a aria-current="page" class="sidebarItemLink_mo7H sidebarItemLinkActive_I1ZP" href="/autogen/blog/2023/04/21/LLM-tuning-math">Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH</a></li></ul></nav></aside><main class="col col--7" itemscope="" itemtype="https://schema.org/Blog"><article itemprop="blogPost" itemscope="" itemtype="https://schema.org/BlogPosting"><meta itemprop="description" content="level 2 algebra"><header><h1 class="title_f1Hy" itemprop="headline">Does Model and Inference Parameter Matter in LLM Applications? 
- A Case Study for MATH</h1><div class="container_mt6G margin-vert--md"><time datetime="2023-04-21T00:00:00.000Z" itemprop="datePublished">April 21, 2023</time> · <!-- -->6 min read</div><div class="margin-top--md margin-bottom--sm row"><div class="col col--6 authorCol_Hf19"><div class="avatar margin-bottom--sm"><a href="https://www.linkedin.com/in/chi-wang-49b15b16/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link"><img class="avatar__photo" src="https://github.com/sonichi.png" alt="Chi Wang" itemprop="image"></a><div class="avatar__intro" itemprop="author" itemscope="" itemtype="https://schema.org/Person"><div class="avatar__name"><a href="https://www.linkedin.com/in/chi-wang-49b15b16/" target="_blank" rel="noopener noreferrer" itemprop="url"><span itemprop="name">Chi Wang</span></a></div><small class="avatar__subtitle" itemprop="description">Principal Researcher at Microsoft Research</small></div></div></div></div></header><div id="__blog-post-container" class="markdown" itemprop="articleBody"><p><img decoding="async" loading="lazy" alt="level 2 algebra" src="/autogen/assets/images/level2algebra-659ba95286432d9945fc89e84d606797.png" width="575" height="469" class="img_ev3q"></p> 21 | <p><strong>TL;DR:</strong></p> 22 | <ul> 23 | <li><strong>Just by tuning the inference parameters like model, number of responses, temperature etc. without changing any model weights or prompt, the baseline accuracy of untuned gpt-4 can be improved by 20% in high school math competition problems.</strong></li> 24 | <li><strong>For easy problems, the tuned gpt-3.5-turbo model vastly outperformed untuned gpt-4 in accuracy (e.g., 90% vs. 70%) and cost efficiency. For hard problems, the tuned gpt-4 is much more accurate (e.g., 35% vs. 20%) and less expensive than untuned gpt-4.</strong></li> 25 | <li><strong>AutoGen can help with model selection, parameter tuning, and cost-saving in LLM applications.</strong></li> 26 | </ul> 27 | <p>Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?</p> 28 | <p>In this blog post, we will explore how model and inference parameter matter in LLM applications, using a case study for <a href="https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html" target="_blank" rel="noopener noreferrer">MATH</a>, a benchmark for evaluating LLMs on advanced mathematical problem solving. MATH consists of 12K math competition problems from AMC-10, AMC-12 and AIME. Each problem is accompanied by a step-by-step solution.</p> 29 | <p>We will use AutoGen to automatically find the best model and inference parameter for LLMs on a given task and dataset given an inference budget, using a novel low-cost search & pruning strategy. AutoGen currently supports all the LLMs from OpenAI, such as GPT-3.5 and GPT-4.</p> 30 | <p>We will use AutoGen to perform model selection and inference parameter tuning. Then we compare the performance and inference cost on solving algebra problems with the untuned gpt-4. 
We will also analyze how different difficulty levels affect the results.</p> 31 | <h2 class="anchor anchorWithStickyNavbar_LWe7" id="experiment-setup">Experiment Setup<a href="#experiment-setup" class="hash-link" aria-label="Direct link to Experiment Setup" title="Direct link to Experiment Setup"></a></h2> 32 | <p>We use AutoGen to select between the following models with a target inference budget $0.02 per instance:</p> 33 | <ul> 34 | <li>gpt-3.5-turbo, a relatively cheap model that powers the popular ChatGPT app</li> 35 | <li>gpt-4, the state of the art LLM that costs more than 10 times of gpt-3.5-turbo</li> 36 | </ul> 37 | <p>We adapt the models using 20 examples in the train set, using the problem statement as the input and generating the solution as the output. We use the following inference parameters:</p> 38 | <ul> 39 | <li>temperature: The parameter that controls the randomness of the output text. A higher temperature means more diversity but less coherence. We search for the optimal temperature in the range of [0, 1].</li> 40 | <li>top_p: The parameter that controls the probability mass of the output tokens. Only tokens with a cumulative probability less than or equal to top-p are considered. A lower top-p means more diversity but less coherence. We search for the optimal top-p in the range of [0, 1].</li> 41 | <li>max_tokens: The maximum number of tokens that can be generated for each output. We search for the optimal max length in the range of [50, 1000].</li> 42 | <li>n: The number of responses to generate. We search for the optimal n in the range of [1, 100].</li> 43 | <li>prompt: We use the template: "{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \boxed{{}}." where {problem} will be replaced by the math problem instance.</li> 44 | </ul> 45 | <p>In this experiment, when n > 1, we find the answer with highest votes among all the responses and then select it as the final answer to compare with the ground truth. For example, if n = 5 and 3 of the responses contain a final answer 301 while 2 of the responses contain a final answer 159, we choose 301 as the final answer. This can help with resolving potential errors due to randomness. We use the average accuracy and average inference cost as the metric to evaluate the performance over a dataset. The inference cost of a particular instance is measured by the price per 1K tokens and the number of tokens consumed.</p> 46 | <h2 class="anchor anchorWithStickyNavbar_LWe7" id="experiment-results">Experiment Results<a href="#experiment-results" class="hash-link" aria-label="Direct link to Experiment Results" title="Direct link to Experiment Results"></a></h2> 47 | <p>The first figure in this blog post shows the average accuracy and average inference cost of each configuration on the level 2 Algebra test set.</p> 48 | <p>Surprisingly, the tuned gpt-3.5-turbo model is selected as a better model and it vastly outperforms untuned gpt-4 in accuracy (92% vs. 70%) with equal or 2.5 times higher inference budget. 
49 | The same observation can be obtained on the level 3 Algebra test set.</p> 50 | <p><img decoding="async" loading="lazy" alt="level 3 algebra" src="/autogen/assets/images/level3algebra-94e87a683ac8832ac7ae6f41f30131a4.png" width="575" height="469" class="img_ev3q"></p> 51 | <p>However, the selected model changes on level 4 Algebra.</p> 52 | <p><img decoding="async" loading="lazy" alt="level 4 algebra" src="/autogen/assets/images/level4algebra-492beb22490df30d6cc258f061912dcd.png" width="580" height="469" class="img_ev3q"></p> 53 | <p>This time gpt-4 is selected as the best model. The tuned gpt-4 achieves much higher accuracy (56% vs. 44%) and lower cost than the untuned gpt-4. 54 | On level 5 the result is similar.</p> 55 | <p><img decoding="async" loading="lazy" alt="level 5 algebra" src="/autogen/assets/images/level5algebra-8fba701551334296d08580b4b489fe56.png" width="575" height="469" class="img_ev3q"></p> 56 | <p>We can see that AutoGen has found different optimal model and inference parameters for each subset of a particular level, which shows that these parameters matter in cost-sensitive LLM applications and need to be carefully tuned or adapted.</p> 57 | <p>An example notebook to run these experiments can be found at: <a href="https://github.com/microsoft/FLAML/blob/v1.2.1/notebook/autogen_chatgpt.ipynb" target="_blank" rel="noopener noreferrer">https://github.com/microsoft/FLAML/blob/v1.2.1/notebook/autogen_chatgpt.ipynb</a>. The experiments were run when AutoGen was a subpackage in FLAML.</p> 58 | <h2 class="anchor anchorWithStickyNavbar_LWe7" id="analysis-and-discussion">Analysis and Discussion<a href="#analysis-and-discussion" class="hash-link" aria-label="Direct link to Analysis and Discussion" title="Direct link to Analysis and Discussion"></a></h2> 59 | <p>While gpt-3.5-turbo demonstrates competitive accuracy with voted answers in relatively easy algebra problems under the same inference budget, gpt-4 is a better choice for the most difficult problems. In general, through parameter tuning and model selection, we can identify the opportunity to save the expensive model for more challenging tasks, and improve the overall effectiveness of a budget-constrained system.</p> 60 | <p>There are many other alternative ways of solving math problems, which we have not covered in this blog post. When there are choices beyond the inference parameters, they can be generally tuned via <a href="https://microsoft.github.io/FLAML/docs/Use-Cases/Tune-User-Defined-Function" target="_blank" rel="noopener noreferrer"><code>flaml.tune</code></a>.</p> 61 | <p>The need for model selection, parameter tuning and cost saving is not specific to the math problems. The <a href="https://github.com/Significant-Gravitas/Auto-GPT" target="_blank" rel="noopener noreferrer">Auto-GPT</a> project is an example where high cost can easily prevent a generic complex task to be accomplished as it needs many LLM inference calls.</p> 62 | <h2 class="anchor anchorWithStickyNavbar_LWe7" id="for-further-reading">For Further Reading<a href="#for-further-reading" class="hash-link" aria-label="Direct link to For Further Reading" title="Direct link to For Further Reading"></a></h2> 63 | <ul> 64 | <li><a href="https://arxiv.org/abs/2303.04673" target="_blank" rel="noopener noreferrer">Research paper about the tuning technique</a></li> 65 | <li><a href="/autogen/docs/Use-Cases/enhanced_inference">Documentation about inference tuning</a></li> 66 | </ul> 67 | <p><em>Do you have any experience to share about LLM applications? 
Would you like to see more support for, or research on, LLM optimization and automation? Please join our <a href="https://discord.gg/pAbnFJrkgZ" target="_blank" rel="noopener noreferrer">Discord</a> server for discussion.</em></p></div><footer class="row docusaurus-mt-lg blogPostFooterDetailsFull_mRVl"><div class="col"><b>Tags:</b><ul class="tags_jXut padding--none margin-left--sm"><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/autogen/blog/tags/llm">LLM</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/autogen/blog/tags/gpt">GPT</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/autogen/blog/tags/research">research</a></li></ul></div></footer></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Blog post page navigation"><a class="pagination-nav__link pagination-nav__link--prev" href="/autogen/blog/2023/05/18/GPT-adaptive-humaneval"><div class="pagination-nav__sublabel">Newer Post</div><div class="pagination-nav__label">Achieve More, Pay Less - Use GPT-4 Smartly</div></a></nav></main><div class="col col--2"><div class="tableOfContents_bqdL thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#experiment-setup" class="table-of-contents__link toc-highlight">Experiment Setup</a></li><li><a href="#experiment-results" class="table-of-contents__link toc-highlight">Experiment Results</a></li><li><a href="#analysis-and-discussion" class="table-of-contents__link toc-highlight">Analysis and Discussion</a></li><li><a href="#for-further-reading" class="table-of-contents__link toc-highlight">For Further Reading</a></li></ul></div></div></div></div></div><footer class="footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://discord.gg/pAbnFJrkgZ" target="_blank" rel="noopener noreferrer" class="footer__link-item">Discord<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://twitter.com/pyautogen" target="_blank" rel="noopener noreferrer" class="footer__link-item">Twitter<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">Copyright © 2024 AutoGen Authors | <a target="_blank" style="color:#10adff" href="https://go.microsoft.com/fwlink/?LinkId=521839">Privacy and Cookies</a></div></div></div></footer></div>
68 | </body>
69 | </html>
70 | ```

--------------------------------------------------------------------------------
/packages/markitdown/src/markitdown/_markitdown.py:
--------------------------------------------------------------------------------

```python
1 | import mimetypes
2 | import os
3 | import re
4 | import sys
5 | import shutil
6 | import traceback
7 | import io
8 | from dataclasses import dataclass
9 | from importlib.metadata import entry_points
10 | from typing import Any, List, Dict, Optional, Union, BinaryIO
11 | from pathlib import Path
12 | from urllib.parse 
import urlparse 13 | from warnings import warn 14 | import requests 15 | import magika 16 | import charset_normalizer 17 | import codecs 18 | 19 | from ._stream_info import StreamInfo 20 | from ._uri_utils import parse_data_uri, file_uri_to_path 21 | 22 | from .converters import ( 23 | PlainTextConverter, 24 | HtmlConverter, 25 | RssConverter, 26 | WikipediaConverter, 27 | YouTubeConverter, 28 | IpynbConverter, 29 | BingSerpConverter, 30 | PdfConverter, 31 | DocxConverter, 32 | XlsxConverter, 33 | XlsConverter, 34 | PptxConverter, 35 | ImageConverter, 36 | AudioConverter, 37 | OutlookMsgConverter, 38 | ZipConverter, 39 | EpubConverter, 40 | DocumentIntelligenceConverter, 41 | CsvConverter, 42 | ) 43 | 44 | from ._base_converter import DocumentConverter, DocumentConverterResult 45 | 46 | from ._exceptions import ( 47 | FileConversionException, 48 | UnsupportedFormatException, 49 | FailedConversionAttempt, 50 | ) 51 | 52 | 53 | # Lower priority values are tried first. 54 | PRIORITY_SPECIFIC_FILE_FORMAT = ( 55 | 0.0 # e.g., .docx, .pdf, .xlsx, Or specific pages, e.g., wikipedia 56 | ) 57 | PRIORITY_GENERIC_FILE_FORMAT = ( 58 | 10.0 # Near catch-all converters for mimetypes like text/*, etc. 59 | ) 60 | 61 | 62 | _plugins: Union[None, List[Any]] = None # If None, plugins have not been loaded yet. 63 | 64 | 65 | def _load_plugins() -> Union[None, List[Any]]: 66 | """Lazy load plugins, exiting early if already loaded.""" 67 | global _plugins 68 | 69 | # Skip if we've already loaded plugins 70 | if _plugins is not None: 71 | return _plugins 72 | 73 | # Load plugins 74 | _plugins = [] 75 | for entry_point in entry_points(group="markitdown.plugin"): 76 | try: 77 | _plugins.append(entry_point.load()) 78 | except Exception: 79 | tb = traceback.format_exc() 80 | warn(f"Plugin '{entry_point.name}' failed to load ... skipping:\n{tb}") 81 | 82 | return _plugins 83 | 84 | 85 | @dataclass(kw_only=True, frozen=True) 86 | class ConverterRegistration: 87 | """A registration of a converter with its priority and other metadata.""" 88 | 89 | converter: DocumentConverter 90 | priority: float 91 | 92 | 93 | class MarkItDown: 94 | """(In preview) An extremely simple text-based document reader, suitable for LLM use. 95 | This reader will convert common file-types or webpages to Markdown.""" 96 | 97 | def __init__( 98 | self, 99 | *, 100 | enable_builtins: Union[None, bool] = None, 101 | enable_plugins: Union[None, bool] = None, 102 | **kwargs, 103 | ): 104 | self._builtins_enabled = False 105 | self._plugins_enabled = False 106 | 107 | requests_session = kwargs.get("requests_session") 108 | if requests_session is None: 109 | self._requests_session = requests.Session() 110 | else: 111 | self._requests_session = requests_session 112 | 113 | self._magika = magika.Magika() 114 | 115 | # TODO - remove these (see enable_builtins) 116 | self._llm_client: Any = None 117 | self._llm_model: Union[str | None] = None 118 | self._llm_prompt: Union[str | None] = None 119 | self._exiftool_path: Union[str | None] = None 120 | self._style_map: Union[str | None] = None 121 | 122 | # Register the converters 123 | self._converters: List[ConverterRegistration] = [] 124 | 125 | if ( 126 | enable_builtins is None or enable_builtins 127 | ): # Default to True when not specified 128 | self.enable_builtins(**kwargs) 129 | 130 | if enable_plugins: 131 | self.enable_plugins(**kwargs) 132 | 133 | def enable_builtins(self, **kwargs) -> None: 134 | """ 135 | Enable and register built-in converters. 
136 | Built-in converters are enabled by default. 137 | This method should only be called once, if built-ins were initially disabled. 138 | """ 139 | if not self._builtins_enabled: 140 | # TODO: Move these into converter constructors 141 | self._llm_client = kwargs.get("llm_client") 142 | self._llm_model = kwargs.get("llm_model") 143 | self._llm_prompt = kwargs.get("llm_prompt") 144 | self._exiftool_path = kwargs.get("exiftool_path") 145 | self._style_map = kwargs.get("style_map") 146 | 147 | if self._exiftool_path is None: 148 | self._exiftool_path = os.getenv("EXIFTOOL_PATH") 149 | 150 | # Still none? Check well-known paths 151 | if self._exiftool_path is None: 152 | candidate = shutil.which("exiftool") 153 | if candidate: 154 | candidate = os.path.abspath(candidate) 155 | if any( 156 | d == os.path.dirname(candidate) 157 | for d in [ 158 | "/usr/bin", 159 | "/usr/local/bin", 160 | "/opt", 161 | "/opt/bin", 162 | "/opt/local/bin", 163 | "/opt/homebrew/bin", 164 | "C:\\Windows\\System32", 165 | "C:\\Program Files", 166 | "C:\\Program Files (x86)", 167 | ] 168 | ): 169 | self._exiftool_path = candidate 170 | 171 | # Register converters for successful browsing operations 172 | # Later registrations are tried first / take higher priority than earlier registrations 173 | # To this end, the most specific converters should appear below the most generic converters 174 | self.register_converter( 175 | PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT 176 | ) 177 | self.register_converter( 178 | ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT 179 | ) 180 | self.register_converter( 181 | HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT 182 | ) 183 | self.register_converter(RssConverter()) 184 | self.register_converter(WikipediaConverter()) 185 | self.register_converter(YouTubeConverter()) 186 | self.register_converter(BingSerpConverter()) 187 | self.register_converter(DocxConverter()) 188 | self.register_converter(XlsxConverter()) 189 | self.register_converter(XlsConverter()) 190 | self.register_converter(PptxConverter()) 191 | self.register_converter(AudioConverter()) 192 | self.register_converter(ImageConverter()) 193 | self.register_converter(IpynbConverter()) 194 | self.register_converter(PdfConverter()) 195 | self.register_converter(OutlookMsgConverter()) 196 | self.register_converter(EpubConverter()) 197 | self.register_converter(CsvConverter()) 198 | 199 | # Register Document Intelligence converter at the top of the stack if endpoint is provided 200 | docintel_endpoint = kwargs.get("docintel_endpoint") 201 | if docintel_endpoint is not None: 202 | docintel_args: Dict[str, Any] = {} 203 | docintel_args["endpoint"] = docintel_endpoint 204 | 205 | docintel_credential = kwargs.get("docintel_credential") 206 | if docintel_credential is not None: 207 | docintel_args["credential"] = docintel_credential 208 | 209 | docintel_types = kwargs.get("docintel_file_types") 210 | if docintel_types is not None: 211 | docintel_args["file_types"] = docintel_types 212 | 213 | docintel_version = kwargs.get("docintel_api_version") 214 | if docintel_version is not None: 215 | docintel_args["api_version"] = docintel_version 216 | 217 | self.register_converter( 218 | DocumentIntelligenceConverter(**docintel_args), 219 | ) 220 | 221 | self._builtins_enabled = True 222 | else: 223 | warn("Built-in converters are already enabled.", RuntimeWarning) 224 | 225 | def enable_plugins(self, **kwargs) -> None: 226 | """ 227 | Enable and register converters provided by plugins. 
228 |         Plugins are disabled by default.
229 |         This method should only be called once, if plugins were initially disabled.
230 |         """
231 |         if not self._plugins_enabled:
232 |             # Load plugins
233 |             plugins = _load_plugins()
234 |             assert plugins is not None
235 |             for plugin in plugins:
236 |                 try:
237 |                     plugin.register_converters(self, **kwargs)
238 |                 except Exception:
239 |                     tb = traceback.format_exc()
240 |                     warn(f"Plugin '{plugin}' failed to register converters:\n{tb}")
241 |             self._plugins_enabled = True
242 |         else:
243 |             warn("Plugin converters are already enabled.", RuntimeWarning)
244 | 
245 |     def convert(
246 |         self,
247 |         source: Union[str, requests.Response, Path, BinaryIO],
248 |         *,
249 |         stream_info: Optional[StreamInfo] = None,
250 |         **kwargs: Any,
251 |     ) -> DocumentConverterResult:  # TODO: deal with kwargs
252 |         """
253 |         Args:
254 |         - source: can be a path (str or Path), a URL, a requests.Response object, or a binary file-like object (BinaryIO)
255 |         - stream_info: optional stream info to use for the conversion. If None, infer from source
256 |         - kwargs: additional arguments to pass to the converter
257 |         """
258 | 
259 |         # Local path or url
260 |         if isinstance(source, str):
261 |             if (
262 |                 source.startswith("http:")
263 |                 or source.startswith("https:")
264 |                 or source.startswith("file:")
265 |                 or source.startswith("data:")
266 |             ):
267 |                 # Rename the url argument to mock_url
268 |                 # (Deprecated -- use stream_info)
269 |                 _kwargs = {k: v for k, v in kwargs.items()}
270 |                 if "url" in _kwargs:
271 |                     _kwargs["mock_url"] = _kwargs["url"]
272 |                     del _kwargs["url"]
273 | 
274 |                 return self.convert_uri(source, stream_info=stream_info, **_kwargs)
275 |             else:
276 |                 return self.convert_local(source, stream_info=stream_info, **kwargs)
277 |         # Path object
278 |         elif isinstance(source, Path):
279 |             return self.convert_local(source, stream_info=stream_info, **kwargs)
280 |         # Request response
281 |         elif isinstance(source, requests.Response):
282 |             return self.convert_response(source, stream_info=stream_info, **kwargs)
283 |         # Binary stream
284 |         elif (
285 |             hasattr(source, "read")
286 |             and callable(source.read)
287 |             and not isinstance(source, io.TextIOBase)
288 |         ):
289 |             return self.convert_stream(source, stream_info=stream_info, **kwargs)
290 |         else:
291 |             raise TypeError(
292 |                 f"Invalid source type: {type(source)}. Expected str, Path, requests.Response, or BinaryIO."
293 | ) 294 | 295 | def convert_local( 296 | self, 297 | path: Union[str, Path], 298 | *, 299 | stream_info: Optional[StreamInfo] = None, 300 | file_extension: Optional[str] = None, # Deprecated -- use stream_info 301 | url: Optional[str] = None, # Deprecated -- use stream_info 302 | **kwargs: Any, 303 | ) -> DocumentConverterResult: 304 | if isinstance(path, Path): 305 | path = str(path) 306 | 307 | # Build a base StreamInfo object from which to start guesses 308 | base_guess = StreamInfo( 309 | local_path=path, 310 | extension=os.path.splitext(path)[1], 311 | filename=os.path.basename(path), 312 | ) 313 | 314 | # Extend the base_guess with any additional info from the arguments 315 | if stream_info is not None: 316 | base_guess = base_guess.copy_and_update(stream_info) 317 | 318 | if file_extension is not None: 319 | # Deprecated -- use stream_info 320 | base_guess = base_guess.copy_and_update(extension=file_extension) 321 | 322 | if url is not None: 323 | # Deprecated -- use stream_info 324 | base_guess = base_guess.copy_and_update(url=url) 325 | 326 | with open(path, "rb") as fh: 327 | guesses = self._get_stream_info_guesses( 328 | file_stream=fh, base_guess=base_guess 329 | ) 330 | return self._convert(file_stream=fh, stream_info_guesses=guesses, **kwargs) 331 | 332 | def convert_stream( 333 | self, 334 | stream: BinaryIO, 335 | *, 336 | stream_info: Optional[StreamInfo] = None, 337 | file_extension: Optional[str] = None, # Deprecated -- use stream_info 338 | url: Optional[str] = None, # Deprecated -- use stream_info 339 | **kwargs: Any, 340 | ) -> DocumentConverterResult: 341 | guesses: List[StreamInfo] = [] 342 | 343 | # Do we have anything on which to base a guess? 344 | base_guess = None 345 | if stream_info is not None or file_extension is not None or url is not None: 346 | # Start with a non-Null base guess 347 | if stream_info is None: 348 | base_guess = StreamInfo() 349 | else: 350 | base_guess = stream_info 351 | 352 | if file_extension is not None: 353 | # Deprecated -- use stream_info 354 | assert base_guess is not None # for mypy 355 | base_guess = base_guess.copy_and_update(extension=file_extension) 356 | 357 | if url is not None: 358 | # Deprecated -- use stream_info 359 | assert base_guess is not None # for mypy 360 | base_guess = base_guess.copy_and_update(url=url) 361 | 362 | # Check if we have a seekable stream. If not, load the entire stream into memory. 
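        # (Non-seekable streams, such as sys.stdin.buffer or a raw network
        # response, are buffered into an in-memory BytesIO object, because
        # converters must be able to rewind the stream between format guesses.)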
363 | if not stream.seekable(): 364 | buffer = io.BytesIO() 365 | while True: 366 | chunk = stream.read(4096) 367 | if not chunk: 368 | break 369 | buffer.write(chunk) 370 | buffer.seek(0) 371 | stream = buffer 372 | 373 | # Add guesses based on stream content 374 | guesses = self._get_stream_info_guesses( 375 | file_stream=stream, base_guess=base_guess or StreamInfo() 376 | ) 377 | return self._convert(file_stream=stream, stream_info_guesses=guesses, **kwargs) 378 | 379 | def convert_url( 380 | self, 381 | url: str, 382 | *, 383 | stream_info: Optional[StreamInfo] = None, 384 | file_extension: Optional[str] = None, 385 | mock_url: Optional[str] = None, 386 | **kwargs: Any, 387 | ) -> DocumentConverterResult: 388 | """Alias for convert_uri()""" 389 | # convert_url will likely be deprecated in the future in favor of convert_uri 390 | return self.convert_uri( 391 | url, 392 | stream_info=stream_info, 393 | file_extension=file_extension, 394 | mock_url=mock_url, 395 | **kwargs, 396 | ) 397 | 398 | def convert_uri( 399 | self, 400 | uri: str, 401 | *, 402 | stream_info: Optional[StreamInfo] = None, 403 | file_extension: Optional[str] = None, # Deprecated -- use stream_info 404 | mock_url: Optional[ 405 | str 406 | ] = None, # Mock the request as if it came from a different URL 407 | **kwargs: Any, 408 | ) -> DocumentConverterResult: 409 | uri = uri.strip() 410 | 411 | # File URIs 412 | if uri.startswith("file:"): 413 | netloc, path = file_uri_to_path(uri) 414 | if netloc and netloc != "localhost": 415 | raise ValueError( 416 | f"Unsupported file URI: {uri}. Netloc must be empty or localhost." 417 | ) 418 | return self.convert_local( 419 | path, 420 | stream_info=stream_info, 421 | file_extension=file_extension, 422 | url=mock_url, 423 | **kwargs, 424 | ) 425 | # Data URIs 426 | elif uri.startswith("data:"): 427 | mimetype, attributes, data = parse_data_uri(uri) 428 | 429 | base_guess = StreamInfo( 430 | mimetype=mimetype, 431 | charset=attributes.get("charset"), 432 | ) 433 | if stream_info is not None: 434 | base_guess = base_guess.copy_and_update(stream_info) 435 | 436 | return self.convert_stream( 437 | io.BytesIO(data), 438 | stream_info=base_guess, 439 | file_extension=file_extension, 440 | url=mock_url, 441 | **kwargs, 442 | ) 443 | # HTTP/HTTPS URIs 444 | elif uri.startswith("http:") or uri.startswith("https:"): 445 | response = self._requests_session.get(uri, stream=True) 446 | response.raise_for_status() 447 | return self.convert_response( 448 | response, 449 | stream_info=stream_info, 450 | file_extension=file_extension, 451 | url=mock_url, 452 | **kwargs, 453 | ) 454 | else: 455 | raise ValueError( 456 | f"Unsupported URI scheme: {uri.split(':')[0]}. 
Supported schemes are: file:, data:, http:, https:" 457 | ) 458 | 459 | def convert_response( 460 | self, 461 | response: requests.Response, 462 | *, 463 | stream_info: Optional[StreamInfo] = None, 464 | file_extension: Optional[str] = None, # Deprecated -- use stream_info 465 | url: Optional[str] = None, # Deprecated -- use stream_info 466 | **kwargs: Any, 467 | ) -> DocumentConverterResult: 468 | # If there is a content-type header, get the mimetype and charset (if present) 469 | mimetype: Optional[str] = None 470 | charset: Optional[str] = None 471 | 472 | if "content-type" in response.headers: 473 | parts = response.headers["content-type"].split(";") 474 | mimetype = parts.pop(0).strip() 475 | for part in parts: 476 | if part.strip().startswith("charset="): 477 | _charset = part.split("=")[1].strip() 478 | if len(_charset) > 0: 479 | charset = _charset 480 | 481 | # If there is a content-disposition header, get the filename and possibly the extension 482 | filename: Optional[str] = None 483 | extension: Optional[str] = None 484 | if "content-disposition" in response.headers: 485 | m = re.search(r"filename=([^;]+)", response.headers["content-disposition"]) 486 | if m: 487 | filename = m.group(1).strip("\"'") 488 | _, _extension = os.path.splitext(filename) 489 | if len(_extension) > 0: 490 | extension = _extension 491 | 492 | # If there is still no filename, try to read it from the url 493 | if filename is None: 494 | parsed_url = urlparse(response.url) 495 | _, _extension = os.path.splitext(parsed_url.path) 496 | if len(_extension) > 0: # Looks like this might be a file! 497 | filename = os.path.basename(parsed_url.path) 498 | extension = _extension 499 | 500 | # Create an initial guess from all this information 501 | base_guess = StreamInfo( 502 | mimetype=mimetype, 503 | charset=charset, 504 | filename=filename, 505 | extension=extension, 506 | url=response.url, 507 | ) 508 | 509 | # Update with any additional info from the arguments 510 | if stream_info is not None: 511 | base_guess = base_guess.copy_and_update(stream_info) 512 | if file_extension is not None: 513 | # Deprecated -- use stream_info 514 | base_guess = base_guess.copy_and_update(extension=file_extension) 515 | if url is not None: 516 | # Deprecated -- use stream_info 517 | base_guess = base_guess.copy_and_update(url=url) 518 | 519 | # Read into BytesIO 520 | buffer = io.BytesIO() 521 | for chunk in response.iter_content(chunk_size=512): 522 | buffer.write(chunk) 523 | buffer.seek(0) 524 | 525 | # Convert 526 | guesses = self._get_stream_info_guesses( 527 | file_stream=buffer, base_guess=base_guess 528 | ) 529 | return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs) 530 | 531 | def _convert( 532 | self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs 533 | ) -> DocumentConverterResult: 534 | res: Union[None, DocumentConverterResult] = None 535 | 536 | # Keep track of which converters throw exceptions 537 | failed_attempts: List[FailedConversionAttempt] = [] 538 | 539 | # Create a copy of the page_converters list, sorted by priority. 540 | # We do this with each call to _convert because the priority of converters may change between calls. 541 | # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order. 
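        # (register_converter() inserts new registrations at index 0, so within
        # a given priority the most recently registered converter is tried first.)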
542 |         sorted_registrations = sorted(self._converters, key=lambda x: x.priority)
543 | 
544 |         # Remember the initial stream position so that we can return to it
545 |         cur_pos = file_stream.tell()
546 | 
547 |         for stream_info in stream_info_guesses + [StreamInfo()]:
548 |             for converter_registration in sorted_registrations:
549 |                 converter = converter_registration.converter
550 |                 # Sanity check -- make sure the cur_pos is still the same
551 |                 assert (
552 |                     cur_pos == file_stream.tell()
553 |                 ), "File stream position should NOT change between guess iterations"
554 | 
555 |                 _kwargs = {k: v for k, v in kwargs.items()}
556 | 
557 |                 # Copy any additional global options
558 |                 if "llm_client" not in _kwargs and self._llm_client is not None:
559 |                     _kwargs["llm_client"] = self._llm_client
560 | 
561 |                 if "llm_model" not in _kwargs and self._llm_model is not None:
562 |                     _kwargs["llm_model"] = self._llm_model
563 | 
564 |                 if "llm_prompt" not in _kwargs and self._llm_prompt is not None:
565 |                     _kwargs["llm_prompt"] = self._llm_prompt
566 | 
567 |                 if "style_map" not in _kwargs and self._style_map is not None:
568 |                     _kwargs["style_map"] = self._style_map
569 | 
570 |                 if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
571 |                     _kwargs["exiftool_path"] = self._exiftool_path
572 | 
573 |                 # Add the list of converters for nested processing
574 |                 _kwargs["_parent_converters"] = self._converters
575 | 
576 |                 # Add legacy kwargs
577 |                 if stream_info is not None:
578 |                     if stream_info.extension is not None:
579 |                         _kwargs["file_extension"] = stream_info.extension
580 | 
581 |                     if stream_info.url is not None:
582 |                         _kwargs["url"] = stream_info.url
583 | 
584 |                 # Check if the converter will accept the file, and if so, try to convert it
585 |                 _accepts = False
586 |                 try:
587 |                     _accepts = converter.accepts(file_stream, stream_info, **_kwargs)
588 |                 except NotImplementedError:
589 |                     pass
590 | 
591 |                 # accepts() should not have changed the file stream position
592 |                 assert (
593 |                     cur_pos == file_stream.tell()
594 |                 ), f"{type(converter).__name__}.accepts() should NOT change the file_stream position"
595 | 
596 |                 # Attempt the conversion
597 |                 if _accepts:
598 |                     try:
599 |                         res = converter.convert(file_stream, stream_info, **_kwargs)
600 |                     except Exception:
601 |                         failed_attempts.append(
602 |                             FailedConversionAttempt(
603 |                                 converter=converter, exc_info=sys.exc_info()
604 |                             )
605 |                         )
606 |                     finally:
607 |                         file_stream.seek(cur_pos)
608 | 
609 |                 if res is not None:
610 |                     # Normalize the content
611 |                     res.text_content = "\n".join(
612 |                         [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
613 |                     )
614 |                     res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
615 |                     return res
616 | 
617 |         # If we got this far without success, report any exceptions
618 |         if len(failed_attempts) > 0:
619 |             raise FileConversionException(attempts=failed_attempts)
620 | 
621 |         # Nothing can handle it!
622 |         raise UnsupportedFormatException(
623 |             "Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
624 |         )
625 | 
626 |     def register_page_converter(self, converter: DocumentConverter) -> None:
627 |         """DEPRECATED: Use register_converter instead."""
628 |         warn(
629 |             "register_page_converter is deprecated. 
Use register_converter instead.",
630 |             DeprecationWarning,
631 |         )
632 |         self.register_converter(converter)
633 | 
634 |     def register_converter(
635 |         self,
636 |         converter: DocumentConverter,
637 |         *,
638 |         priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
639 |     ) -> None:
640 |         """
641 |         Register a DocumentConverter with a given priority.
642 | 
643 |         Priorities work as follows: By default, most converters get priority
644 |         PRIORITY_SPECIFIC_FILE_FORMAT (== 0). The exceptions
645 |         are the PlainTextConverter, HtmlConverter, and ZipConverter, which get
646 |         priority PRIORITY_GENERIC_FILE_FORMAT (== 10), with lower values
647 |         being tried first (i.e., higher priority).
648 | 
649 |         Just prior to conversion, the converters are sorted by priority, using
650 |         a stable sort. This means that converters with the same priority will
651 |         remain in the same order, with the most recently registered converters
652 |         appearing first.
653 | 
654 |         We have tight control over the order of built-in converters, but
655 |         plugins can register converters in any order. The registration's priority
656 |         field reasserts some control over the order of converters.
657 | 
658 |         Plugins can register converters with any priority, to appear before or
659 |         after the built-ins. For example, a plugin with priority 9 will run
660 |         before the PlainTextConverter (priority 10), but after the format-specific built-in converters (priority 0).
661 |         """
662 |         self._converters.insert(
663 |             0, ConverterRegistration(converter=converter, priority=priority)
664 |         )
665 | 
666 |     def _get_stream_info_guesses(
667 |         self, file_stream: BinaryIO, base_guess: StreamInfo
668 |     ) -> List[StreamInfo]:
669 |         """
670 |         Given a base guess, attempt to guess or expand on the stream info using the stream content (via magika).
671 |         """
672 |         guesses: List[StreamInfo] = []
673 | 
674 |         # Enhance the base guess with information based on the extension or mimetype
675 |         enhanced_guess = base_guess.copy_and_update()
676 | 
677 |         # If there's an extension and no mimetype, try to guess the mimetype
678 |         if base_guess.mimetype is None and base_guess.extension is not None:
679 |             _m, _ = mimetypes.guess_type(
680 |                 "placeholder" + base_guess.extension, strict=False
681 |             )
682 |             if _m is not None:
683 |                 enhanced_guess = enhanced_guess.copy_and_update(mimetype=_m)
684 | 
685 |         # If there's a mimetype and no extension, try to guess the extension
686 |         if base_guess.mimetype is not None and base_guess.extension is None:
687 |             _e = mimetypes.guess_all_extensions(base_guess.mimetype, strict=False)
688 |             if len(_e) > 0:
689 |                 enhanced_guess = enhanced_guess.copy_and_update(extension=_e[0])
690 | 
691 |         # Call magika to guess from the stream
692 |         cur_pos = file_stream.tell()
693 |         try:
694 |             result = self._magika.identify_stream(file_stream)
695 |             if result.status == "ok" and result.prediction.output.label != "unknown":
696 |                 # If it's text, also guess the charset
697 |                 charset = None
698 |                 if result.prediction.output.is_text:
699 |                     # Read the first 4k to guess the charset
700 |                     file_stream.seek(cur_pos)
701 |                     stream_page = file_stream.read(4096)
702 |                     charset_result = charset_normalizer.from_bytes(stream_page).best()
703 | 
704 |                     if charset_result is not None:
705 |                         charset = self._normalize_charset(charset_result.encoding)
706 | 
707 |                 # Normalize the first extension listed
708 |                 guessed_extension = None
709 |                 if len(result.prediction.output.extensions) > 0:
710 |                     guessed_extension = "." 
+ result.prediction.output.extensions[0] 711 | 712 | # Determine if the guess is compatible with the base guess 713 | compatible = True 714 | if ( 715 | base_guess.mimetype is not None 716 | and base_guess.mimetype != result.prediction.output.mime_type 717 | ): 718 | compatible = False 719 | 720 | if ( 721 | base_guess.extension is not None 722 | and base_guess.extension.lstrip(".") 723 | not in result.prediction.output.extensions 724 | ): 725 | compatible = False 726 | 727 | if ( 728 | base_guess.charset is not None 729 | and self._normalize_charset(base_guess.charset) != charset 730 | ): 731 | compatible = False 732 | 733 | if compatible: 734 | # Add the compatible base guess 735 | guesses.append( 736 | StreamInfo( 737 | mimetype=base_guess.mimetype 738 | or result.prediction.output.mime_type, 739 | extension=base_guess.extension or guessed_extension, 740 | charset=base_guess.charset or charset, 741 | filename=base_guess.filename, 742 | local_path=base_guess.local_path, 743 | url=base_guess.url, 744 | ) 745 | ) 746 | else: 747 | # The magika guess was incompatible with the base guess, so add both guesses 748 | guesses.append(enhanced_guess) 749 | guesses.append( 750 | StreamInfo( 751 | mimetype=result.prediction.output.mime_type, 752 | extension=guessed_extension, 753 | charset=charset, 754 | filename=base_guess.filename, 755 | local_path=base_guess.local_path, 756 | url=base_guess.url, 757 | ) 758 | ) 759 | else: 760 | # There were no other guesses, so just add the base guess 761 | guesses.append(enhanced_guess) 762 | finally: 763 | file_stream.seek(cur_pos) 764 | 765 | return guesses 766 | 767 | def _normalize_charset(self, charset: str | None) -> str | None: 768 | """ 769 | Normalize a charset string to a canonical form. 770 | """ 771 | if charset is None: 772 | return None 773 | try: 774 | return codecs.lookup(charset).name 775 | except LookupError: 776 | return charset 777 | ```
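To make the conversion pipeline above concrete, here is a minimal usage sketch. It relies only on the API visible in this file; the sample file name and the MyConverter class are hypothetical placeholders (a real one would subclass DocumentConverter):

```python
from markitdown import MarkItDown

md = MarkItDown()  # built-in converters are enabled by default

# Local paths as well as file:, data:, http:, and https: URIs are accepted.
result = md.convert("some_document.docx")  # hypothetical local file
print(result.text_content)  # normalized Markdown output

# HTTP(S) sources are fetched with the shared requests session and
# dispatched through convert_uri() / convert_response().
result = md.convert("https://example.com/index.html")

# Third-party converters register with an explicit priority: 9.0 is tried
# before the generic converters (10.0) but after the format-specific
# built-ins (0.0).
md.register_converter(MyConverter(), priority=9.0)  # MyConverter is hypothetical
```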